import os
import re
import json
import logging
import sys
import shutil
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, asdict, field
from datetime import datetime
import traceback
import argparse
try:
from bs4 import BeautifulSoup
except ImportError:
print("ERROR: BeautifulSoup4 not installed. Install with: pip install beautifulsoup4")
sys.exit(1)
from urllib.parse import urlparse, urljoin
class Config:
    """Centralized configuration management.

    Derives every working directory from a single base directory and makes
    sure each of them exists.

    Attributes set by ``__init__``:
        script_dir: base directory all other paths hang off.
        input_dir: ``script_dir / "input_html"``.
        output_dir: ``script_dir / "output_configs"``.
        config_dir: ``script_dir / "reference_configs"``.
        logs_dir: ``script_dir / "logs"``.
        backup_dir: ``script_dir / "backups"``.
    """

    def __init__(self, script_dir: Optional[Path] = None):
        """Initialize configuration with automatic path resolution.

        Args:
            script_dir: Base directory. When ``None`` the resolved directory
                containing this source file is used. Plain strings are
                accepted and converted to ``Path``.
        """
        if script_dir is None:
            script_dir = Path(__file__).parent.resolve()
        # Coerce so string paths support the "/" joins below.
        script_dir = Path(script_dir)

        self.script_dir = script_dir
        self.input_dir = script_dir / "input_html"
        self.output_dir = script_dir / "output_configs"
        self.config_dir = script_dir / "reference_configs"
        self.logs_dir = script_dir / "logs"
        self.backup_dir = script_dir / "backups"
        self._create_directories()

    def _create_directories(self) -> None:
        """Create required directories if they don't exist.

        Creation is best-effort: a failure for one directory is reported on
        stdout and the remaining directories are still attempted.
        """
        required = (
            self.input_dir,
            self.output_dir,
            self.config_dir,
            self.logs_dir,
            self.backup_dir,
        )
        for directory in required:
            try:
                directory.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                # Filesystem problems (permissions, a file occupying the
                # path, ...) must not abort start-up.
                print(f"WARNING: Could not create directory {directory}: {e}")
class LoliScriptValidator:
    """Validates and corrects LoliScript (.loli) syntax.

    ``validate`` normalizes each line (whitespace, typographic quotes,
    command casing) and then checks that block-opening commands are balanced
    by their closing commands. Findings from the most recent ``validate``
    call are kept in ``errors``, ``warnings`` and ``corrections``.
    """

    LOLI_COMMANDS = {
        "FUNCTION", "REQUEST", "PARSE", "SET", "GIB", "KEYCHAIN",
        "NAVIGATE", "BROWSERACTION", "ELEMENTACTION", "EXECUTEJS",
        "IF", "ELSE", "ENDIF", "WHILE", "ENDWHILE", "JUMP", "LABEL",
        "DELETE", "UTILITY", "FOREACH", "ENDFOREACH", "LOOP", "ENDLOOP",
        "PRINT", "BYPASSCF", "TCP", "BEGIN", "END", "SCRIPT", "MOUSEACTION",
        "KEYCHECK", "RECAPTCHA", "CAPTCHA", "BLOCK", "ENDBLOCK"
    }
    LOLI_LOCATORS = {"ID", "CLASS", "SELECTOR", "XPATH", "NAME", "TAG"}
    LOLI_ELEMENT_ACTIONS = {
        "CLICK", "SENDKEYS", "GETTEXT", "GETATTRIBUTE", "SUBMIT",
        "CLEAR", "FOCUS", "HOVER", "DOUBLECLICK", "RIGHTCLICK"
    }
    LOLI_CONDITIONS = {
        "Contains", "Equals", "Regex", "Exists", "NotEqual",
        "Match", "EqualTo", "NotContains", "DoesNotExist"
    }

    # Typographic quotes -> ASCII quotes. Written as escapes so the table
    # survives editors and tools that normalize quote characters.
    _QUOTE_TABLE = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
    })

    # (opening command, closing command) pairs that must balance.
    _BLOCK_PAIRS = (
        ("IF", "ENDIF"),
        ("WHILE", "ENDWHILE"),
        ("FOREACH", "ENDFOREACH"),
        ("BLOCK", "ENDBLOCK"),
    )

    # Leading run of letters on a line, i.e. the command word.
    _LEADING_WORD = re.compile(r"[A-Za-z]+")

    def __init__(self, logger: logging.Logger):
        """Store the logger and start with empty result lists."""
        self.logger = logger
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.corrections: List[str] = []

    def validate(self, content: str) -> Tuple[bool, str]:
        """Validate and correct LoliScript content.

        Args:
            content: Full script text.

        Returns:
            ``(success, corrected_content)``. ``success`` is ``True`` when no
            errors were recorded. For empty/blank input the content is
            returned unchanged together with ``False``.
        """
        self.errors = []
        self.warnings = []
        self.corrections = []

        if not content or not content.strip():
            self.errors.append("Empty LoliScript content")
            return False, content

        corrected_lines = []
        for line_num, line in enumerate(content.split("\n"), 1):
            try:
                corrected_lines.append(self._validate_line(line, line_num))
            except Exception as e:
                # Best-effort: a line that cannot be processed is kept as-is.
                self.logger.warning(f"Error processing line {line_num}: {e}")
                corrected_lines.append(line)

        corrected_content = "\n".join(corrected_lines)
        self._validate_structure(corrected_content)
        success = len(self.errors) == 0

        if self.corrections:
            self.logger.info(f"LoliScript: Applied {len(self.corrections)} corrections")
        if self.warnings:
            self.logger.warning(f"LoliScript: {len(self.warnings)} warnings")
        if self.errors:
            self.logger.error(f"LoliScript: {len(self.errors)} errors")
        return success, corrected_content

    def _validate_line(self, line: str, line_num: int) -> str:
        """Validate and correct a single line.

        Blank lines and ``#`` comments are returned untouched. Other lines
        are trimmed, have whitespace runs collapsed to one space, get ASCII
        quotes and an upper-cased command word. A correction is recorded
        whenever the result differs from the trimmed input.
        """
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            return line

        corrected = re.sub(r"\s+", " ", stripped)
        corrected = self._fix_quotes(corrected)
        corrected = self._validate_command_syntax(corrected)
        if corrected != stripped:
            self.corrections.append(f"Line {line_num}: normalized syntax")
        return corrected

    def _fix_quotes(self, line: str) -> str:
        """Replace typographic double/single quotes with ASCII quotes."""
        return line.translate(self._QUOTE_TABLE)

    def _validate_command_syntax(self, line: str) -> str:
        """Upper-case the leading token when it is a known command.

        Lines whose first token is not in ``LOLI_COMMANDS`` are returned
        unchanged. Single-token lines (e.g. ``endif``) are normalized too.
        """
        tokens = line.split()
        if not tokens:
            return line
        cmd = tokens[0].upper()
        if cmd in self.LOLI_COMMANDS:
            tokens[0] = cmd
            return " ".join(tokens)
        return line

    def _validate_structure(self, content: str) -> None:
        """Check that block-opening and block-closing commands balance.

        Only the leading word of each non-blank, non-comment line is
        counted (case-insensitively), so keywords that merely occur inside
        comments or quoted arguments do not produce false mismatches. Each
        imbalance appends one message to ``errors``.
        """
        counts: Dict[str, int] = {}
        for raw_line in content.split("\n"):
            stripped = raw_line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            match = self._LEADING_WORD.match(stripped)
            if match:
                word = match.group(0).upper()
                counts[word] = counts.get(word, 0) + 1

        for open_name, close_name in self._BLOCK_PAIRS:
            open_count = counts.get(open_name, 0)
            close_count = counts.get(close_name, 0)
            if open_count != close_count:
                self.errors.append(
                    f"Mismatched {open_name}/{close_name}: {open_count} vs {close_count}"
                )
class VBScriptValidator:
    """Validates and corrects VBScript (.svb) syntax.

    ``validate`` normalizes operator spacing and typographic quotes line by
    line, then checks that ``Sub``/``Function``/``If`` blocks are closed.
    Findings from the most recent ``validate`` call are kept in ``errors``,
    ``warnings`` and ``corrections``.
    """

    VB_KEYWORDS = {
        "Sub", "End", "Function", "Dim", "Set", "If", "Then", "Else", "ElseIf",
        "EndIf", "For", "Next", "While", "Wend", "Do", "Loop", "Select", "Case",
        "Exit", "Call", "Return", "Class", "Private", "Public", "Const", "On",
        "Error", "Resume", "With", "Option", "Explicit", "ReDim", "Preserve",
        "CreateObject", "GetObject", "IsObject", "IsEmpty", "IsNull", "Nothing"
    }

    # Typographic double quotes -> ASCII. Single quotes are deliberately left
    # alone because an apostrophe starts a comment in VBScript.
    _QUOTE_TABLE = str.maketrans({"\u201c": '"', "\u201d": '"'})

    # A double-quoted literal; the capture group makes re.split keep it.
    _STRING_LITERAL = re.compile(r'("[^"]*")')

    # Line that is a Rem comment (keyword is case-insensitive).
    _REM_COMMENT = re.compile(r"rem\b", re.IGNORECASE)

    # Phrases that contain a block keyword but do not open a block.
    _NON_OPENERS = re.compile(
        r"\b(?:End|Exit)\s+(?:Sub|Function)\b|\bEnd\s+If\b", re.IGNORECASE
    )

    # "If ... Then <statement>" on one line needs no "End If".
    _SINGLE_LINE_IF = re.compile(r"\bIf\b.*\bThen\b\s*\S", re.IGNORECASE)

    _BLOCK_KEYWORDS = ("Sub", "Function", "If")

    def __init__(self, logger: logging.Logger):
        """Store the logger and start with empty result lists."""
        self.logger = logger
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.corrections: List[str] = []

    def validate(self, content: str) -> Tuple[bool, str]:
        """Validate and correct VBScript content.

        Args:
            content: Full script text.

        Returns:
            ``(success, corrected_content)``. ``success`` is ``True`` when no
            errors were recorded. For empty/blank input the content is
            returned unchanged together with ``False``.
        """
        self.errors = []
        self.warnings = []
        self.corrections = []

        if not content or not content.strip():
            self.errors.append("Empty VBScript content")
            return False, content

        corrected_lines = []
        for line_num, line in enumerate(content.split("\n"), 1):
            try:
                corrected_lines.append(self._validate_line(line, line_num))
            except Exception as e:
                # Best-effort: a line that cannot be processed is kept as-is.
                self.logger.warning(f"Error processing line {line_num}: {e}")
                corrected_lines.append(line)

        corrected_content = "\n".join(corrected_lines)
        self._validate_structure(corrected_content)
        success = len(self.errors) == 0

        if self.corrections:
            self.logger.info(f"VBScript: Applied {len(self.corrections)} corrections")
        if self.warnings:
            self.logger.warning(f"VBScript: {len(self.warnings)} warnings")
        if self.errors:
            self.logger.error(f"VBScript: {len(self.errors)} errors")
        return success, corrected_content

    def _validate_line(self, line: str, line_num: int) -> str:
        """Validate and correct a single line.

        Blank lines and comment lines are returned untouched. Otherwise
        typographic double quotes become ASCII quotes and ``=``, ``<=``,
        ``>=``, ``&`` and ``+`` get exactly one space on each side. Text
        inside double-quoted literals is never modified, so data such as
        URLs with query strings stays intact. A correction is recorded
        whenever the line changes.
        """
        stripped = line.strip()
        if not stripped or self._is_comment(stripped):
            return line

        # Quotes first, so literals written with typographic quotes are
        # recognized and protected by the split below.
        corrected = self._fix_quotes(line)
        parts = self._STRING_LITERAL.split(corrected)
        # re.split with one capture group alternates code / literal / code...
        for index in range(0, len(parts), 2):
            parts[index] = self._space_operators(parts[index])
        corrected = "".join(parts)

        if corrected != line:
            self.corrections.append(f"Line {line_num}: normalized syntax")
        return corrected

    def _is_comment(self, stripped: str) -> bool:
        """Return True when the trimmed line is an apostrophe or Rem comment."""
        return stripped.startswith("'") or bool(self._REM_COMMENT.match(stripped))

    @staticmethod
    def _space_operators(code: str) -> str:
        """Put single spaces around assignment/comparison, ``&`` and ``+``."""
        # "[<>]?=" keeps "<=" and ">=" together instead of splitting them.
        code = re.sub(r"\s*([<>]?=)\s*", r" \1 ", code)
        code = re.sub(r"\s*&\s*", " & ", code)
        code = re.sub(r"\s*\+\s*", " + ", code)
        return code

    def _fix_quotes(self, line: str) -> str:
        """Replace typographic double quotes with ASCII double quotes."""
        return line.translate(self._QUOTE_TABLE)

    def _code_only(self, line: str) -> str:
        """Return the line without string contents and trailing comment."""
        code = self._STRING_LITERAL.sub('""', line)
        # Literals are emptied above, so any apostrophe left starts a comment.
        code = code.split("'", 1)[0].strip()
        if self._REM_COMMENT.match(code):
            return ""
        return code

    def _validate_structure(self, content: str) -> None:
        """Check that Sub/Function/If blocks are closed.

        Comments and string contents are ignored. ``End X`` and ``Exit X``
        are not counted as openers, and a single-line ``If ... Then stmt``
        (which takes no ``End If``) is not counted either. Each imbalance
        appends one message to ``errors``.
        """
        opened = dict.fromkeys(self._BLOCK_KEYWORDS, 0)
        closed = dict.fromkeys(self._BLOCK_KEYWORDS, 0)

        for raw_line in content.split("\n"):
            code = self._code_only(raw_line)
            if not code:
                continue

            closes_here = {}
            for keyword in self._BLOCK_KEYWORDS:
                found = len(
                    re.findall(rf"\bEnd\s+{keyword}\b", code, re.IGNORECASE)
                )
                closes_here[keyword] = found
                closed[keyword] += found

            # Drop "End X"/"Exit X" so the bare keyword search below only
            # sees genuine block openers.
            remainder = self._NON_OPENERS.sub(" ", code)
            for keyword in ("Sub", "Function"):
                opened[keyword] += len(
                    re.findall(rf"\b{keyword}\b", remainder, re.IGNORECASE)
                )

            if_count = len(re.findall(r"\bIf\b", remainder, re.IGNORECASE))
            single_line = self._SINGLE_LINE_IF.search(code) is not None
            # An If that is closed on its own line still pairs with that
            # End If; otherwise single-line Ifs open no block.
            if if_count and (closes_here["If"] or not single_line):
                opened["If"] += if_count

        for keyword in self._BLOCK_KEYWORDS:
            open_name, close_name = keyword, f"End {keyword}"
            open_count, close_count = opened[keyword], closed[keyword]
            if open_count != close_count:
                self.errors.append(
                    f"Mismatched {open_name}/{close_name}: {open_count} vs {close_count}"
                )
@dataclass
class FormElement:
    """A single form control (input, textarea, select, ...) found in HTML."""

    name: str
    type: str
    id: Optional[str] = None
    selector: Optional[str] = None
    placeholder: Optional[str] = None
    required: bool = False

    def to_dict(self) -> Dict:
        """Return the fields as a dict, leaving out every field that is None."""
        populated: Dict = {}
        for key, value in asdict(self).items():
            if value is None:
                continue
            populated[key] = value
        return populated
@dataclass
class FormData:
    """One HTML form together with the controls it contains."""

    id: Optional[str]
    name: Optional[str]
    action: str
    method: str
    selector: str
    fields: List[FormElement] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return a plain-dict view of the form; each field is serialized
        through its own ``to_dict``."""
        payload: Dict = {
            "id": self.id,
            "name": self.name,
            "action": self.action,
            "method": self.method,
            "selector": self.selector,
        }
        serialized_fields = []
        for element in self.fields:
            serialized_fields.append(element.to_dict())
        payload["fields"] = serialized_fields
        return payload
@dataclass
class PageAnalysis:
    """Everything extracted from one HTML page."""

    file: str
    domain: str
    title: Optional[str]
    forms: List[FormData] = field(default_factory=list)
    inputs: List[Dict] = field(default_factory=list)
    buttons: List[Dict] = field(default_factory=list)
    links: List[Dict] = field(default_factory=list)
    selectable_elements: List[Dict] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return a plain-dict view of the analysis.

        Forms are serialized through their own ``to_dict``; the remaining
        lists are placed in the result as they are (not copied).
        """
        summary: Dict = {
            "file": self.file,
            "domain": self.domain,
            "title": self.title,
        }
        form_dicts = []
        for form in self.forms:
            form_dicts.append(form.to_dict())
        summary["forms"] = form_dicts
        summary["inputs"] = self.inputs
        summary["buttons"] = self.buttons
        summary["links"] = self.links
        summary["selectable_elements"] = self.selectable_elements
        return summary
class PatternLearner:
    """Learn config patterns from existing reference files.

    On construction the reference directory is scanned for ``*.loli``,
    ``*.svb`` and ``*.json`` files. Results are stored in
    ``loli_patterns``, ``svb_patterns`` and ``domain_configs``. Files are
    processed in sorted path order so results are reproducible.
    """

    # A VBScript procedure declaration line: optional Public/Private and
    # Default modifiers followed by Sub or Function. Anchoring at the start
    # of the line keeps out "End Sub", "Exit Function" and identifiers that
    # merely contain "Sub" or "Function".
    _SVB_DECLARATION = re.compile(
        r"^\s*(?:(?:Public|Private)\s+)?(?:Default\s+)?(?:Sub|Function)\b",
        re.IGNORECASE,
    )

    def __init__(self, config_dir: Path, logger: logging.Logger):
        """Scan ``config_dir`` for reference files.

        Args:
            config_dir: Directory holding the reference configs. When it
                does not exist a warning is logged and nothing is learned.
            logger: Logger used for progress and problem reports.
        """
        self.config_dir = Path(config_dir)
        self.logger = logger
        self.loli_patterns = self._empty_loli_patterns()
        self.svb_patterns = self._empty_svb_patterns()
        self.domain_configs: Dict[str, List[Path]] = defaultdict(list)

        if self.config_dir.exists():
            self._learn_patterns()
        else:
            self.logger.warning(f"Config directory does not exist: {config_dir}")

    def _empty_loli_patterns(self) -> Dict:
        """Return empty LoliScript pattern template."""
        return {
            "structure": [],
            "commands": defaultdict(list),
            "variable_format": "<VAR_NAME>",
            "request_template": None,
            "parse_template": None,
            "keycheck_template": None,
            "examples": []
        }

    def _empty_svb_patterns(self) -> Dict:
        """Return empty VBScript pattern template."""
        return {
            "structure": [],
            "functions": [],
            "variable_declaration": "Dim {var}",
            "http_request": None,
            "element_interaction": None,
            "examples": []
        }

    def _learn_patterns(self) -> None:
        """Extract and learn patterns from example files.

        Acts as the error boundary for the whole learning step: any
        unexpected exception is logged (traceback at debug level) instead
        of propagating out of the constructor.
        """
        try:
            self._learn_loli_patterns()
            self._learn_svb_patterns()
            self._map_domains_to_configs()
            self.logger.info("Pattern learning completed successfully")
        except Exception as e:
            self.logger.error(f"Error learning patterns: {e}")
            self.logger.debug(traceback.format_exc())

    def _read_lines(self, path: Path) -> Optional[List[str]]:
        """Read a UTF-8 text file and split it on newlines.

        Returns ``None`` (after logging a warning) when the file cannot be
        read or decoded.
        """
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            self.logger.warning(f"Error reading {path}: {e}")
            return None
        return content.split("\n")

    def _learn_loli_patterns(self) -> None:
        """Extract and learn LoliScript patterns from .loli files.

        For every file the first ten lines are appended to ``structure``,
        the file name to ``examples``, and each line whose first token is a
        known LoliScript command is filed under that command.
        """
        loli_files = sorted(self.config_dir.glob("*.loli"))
        if not loli_files:
            self.logger.warning("No .loli files found in config directory")
            return

        known_commands = LoliScriptValidator.LOLI_COMMANDS
        for loli_file in loli_files:
            lines = self._read_lines(loli_file)
            if lines is None:
                continue
            self.loli_patterns["structure"].extend(lines[:10])
            self.loli_patterns["examples"].append(str(loli_file.name))
            for line in lines:
                tokens = line.split()
                if tokens and tokens[0].upper() in known_commands:
                    self.loli_patterns["commands"][tokens[0].upper()].append(line)

    def _learn_svb_patterns(self) -> None:
        """Extract and learn VBScript patterns from .svb files.

        For every file the first ten lines are appended to ``structure``,
        the file name to ``examples``, and every Sub/Function declaration
        line to ``functions``.
        """
        svb_files = sorted(self.config_dir.glob("*.svb"))
        if not svb_files:
            self.logger.warning("No .svb files found in config directory")
            return

        for svb_file in svb_files:
            lines = self._read_lines(svb_file)
            if lines is None:
                continue
            self.svb_patterns["structure"].extend(lines[:10])
            self.svb_patterns["examples"].append(str(svb_file.name))
            self.svb_patterns["functions"].extend(
                line for line in lines if self._SVB_DECLARATION.match(line)
            )

    def _map_domains_to_configs(self) -> None:
        """Map domains to their corresponding config files.

        A ``*.json`` file contributes when its top level is an object with
        a string ``"domain"`` entry. Unreadable, malformed or differently
        shaped files are skipped with a warning.
        """
        for config_file in sorted(self.config_dir.glob("*.json")):
            try:
                with open(config_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both JSON syntax and decoding errors.
                self.logger.warning(f"Error reading config {config_file}: {e}")
                continue

            if not isinstance(data, dict):
                self.logger.warning(
                    f"Error reading config {config_file}: top-level JSON value is not an object"
                )
                continue
            if 'domain' not in data:
                continue
            domain = data['domain']
            if not isinstance(domain, str):
                self.logger.warning(
                    f"Error reading config {config_file}: 'domain' is not a string"
                )
                continue
            self.domain_configs[domain].append(config_file)

    def get_loli_pattern(self, command: str) -> Optional[List[str]]:
        """Retrieve LoliScript patterns for a specific command.

        The lookup is case-insensitive; an unknown command yields an empty
        list.
        """
        return self.loli_patterns["commands"].get(command.upper(), [])

    def get_svb_pattern(self, function_name: str) -> Optional[List[str]]:
        """Retrieve VBScript patterns for a specific function.

        Returns every learned declaration line containing ``function_name``
        as a substring.
        """
        return [f for f in self.svb_patterns["functions"] if function_name in f]

    def get_domain_config(self, domain: str) -> Optional[Path]:
        """Retrieve config file for a specific domain.

        Returns the first config registered for ``domain`` (files are
        registered in sorted path order) or ``None`` when there is none.
        """
        configs = self.domain_configs.get(domain, [])
        return configs[0] if configs else None
class HTMLParser:
"""Parse HTML files and extract forms, inputs, buttons, and links."""
def __init__(self, logger: logging.Logger):
    """Keep the logger and build one validator per script dialect.

    Both validators are constructed with the same logger.
    """
    self.logger = logger
    loli_checker = LoliScriptValidator(logger)
    svb_checker = VBScriptValidator(logger)
    self.validator_loli = loli_checker
    self.validator_svb = svb_checker
def parse_file(self, html_file: Path) -> Optional[PageAnalysis]:
    """Parse an HTML file and extract all relevant elements.

    Args:
        html_file: Path of the HTML file to analyse.

    Returns:
        A populated ``PageAnalysis`` (forms, inputs, buttons, links and
        selectable elements), or ``None`` when reading or parsing fails;
        the failure is logged with the traceback at debug level.
    """
    try:
        # errors='replace' keeps a page with a few non-UTF-8 bytes
        # parseable instead of rejecting the whole file.
        with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._extract_domain(html_file.name)

        raw_title = soup.title.string if soup.title else None
        # Convert to a plain str: the object handed back by ``.string``
        # keeps a reference to the parse tree, which would otherwise stay
        # alive as long as the PageAnalysis does.
        title = str(raw_title) if raw_title is not None else None

        page_analysis = PageAnalysis(
            file=html_file.name,
            domain=domain,
            title=title
        )
        page_analysis.forms = self._extract_forms(soup)
        page_analysis.inputs = self._extract_inputs(soup)
        page_analysis.buttons = self._extract_buttons(soup)
        page_analysis.links = self._extract_links(soup)
        page_analysis.selectable_elements = self._extract_selectable_elements(soup)
        self.logger.info(f"Successfully parsed {html_file.name}")
        return page_analysis
    except Exception as e:
        self.logger.error(f"Error parsing {html_file.name}: {e}")
        self.logger.debug(traceback.format_exc())
        return None
def _extract_domain(self, filename: str) -> str:
"""Extract domain from filename."""
return filename.split('_')[0] if '_' in filename else filename.split('.')[0]
def _extract_forms(self, soup: BeautifulSoup) -> List[FormData]:
    """Collect every ``<form>`` in the document as a ``FormData``.

    A form without an ``id`` attribute gets ``form_<index>``; a missing
    ``method`` defaults to ``GET`` and is upper-cased. Each contained
    ``input``/``textarea``/``select`` becomes a ``FormElement`` whose type
    falls back to the tag name when no ``type`` attribute is present.
    """
    collected = []
    for position, form_tag in enumerate(soup.find_all('form')):
        form_id = form_tag.get('id', f'form_{position}')
        form_name = form_tag.get('name', '')
        action = form_tag.get('action', '')
        method = form_tag.get('method', 'GET').upper()
        form_selector = self._generate_css_selector(form_tag)

        controls = []
        for control in form_tag.find_all(['input', 'textarea', 'select']):
            control_name = control.get('name', '')
            control_type = control.get('type', control.name)
            control_id = control.get('id', '')
            hint = control.get('placeholder', '')
            is_required = control.has_attr('required')
            control_selector = self._generate_css_selector(control)
            element = FormElement(
                name=control_name,
                type=control_type,
                id=control_id,
                selector=control_selector,
                placeholder=hint,
                required=is_required,
            )
            controls.append(element)

        record = FormData(
            id=form_id,
            name=form_name,
            action=action,
            method=method,
            selector=form_selector,
            fields=controls,
        )
        collected.append(record)
    return collected
def _extract_inputs(self, soup: BeautifulSoup) -> List[Dict]:
    """Describe every ``<input>`` element in the document.

    Each entry holds id, name, type, value, placeholder, required flag and
    a selector. A missing ``id`` becomes ``input_<index>``, a missing
    ``type`` becomes ``text``, and other missing attributes become empty
    strings.
    """
    return [
        {
            'id': tag.get('id', f'input_{position}'),
            'name': tag.get('name', ''),
            'type': tag.get('type', 'text'),
            'value': tag.get('value', ''),
            'placeholder': tag.get('placeholder', ''),
            'required': tag.has_attr('required'),
            'selector': self._generate_css_selector(tag),
        }
        for position, tag in enumerate(soup.find_all('input'))
    ]
def _extract_buttons(self, soup: BeautifulSoup) -> List[Dict]:
"""Extract all button elements."""
buttons = []
for idx, btn in enumerate(soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'])):0 views