# Untitled Page


import os
import re
import json
import logging
import sys
import shutil
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, asdict, field
from datetime import datetime
import traceback
import argparse

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("ERROR: BeautifulSoup4 not installed. Install with: pip install beautifulsoup4")
    sys.exit(1)

from urllib.parse import urlparse, urljoin

class Config:
    """Centralized configuration management.

    Derives every working directory of the tool from a single base directory
    and creates the directories on construction.

    Attributes:
        script_dir: Base directory all other paths are derived from.
        input_dir: ``<script_dir>/input_html``.
        output_dir: ``<script_dir>/output_configs``.
        config_dir: ``<script_dir>/reference_configs``.
        logs_dir: ``<script_dir>/logs``.
        backup_dir: ``<script_dir>/backups``.
    """

    def __init__(self, script_dir: Optional[Path] = None):
        """Initialize configuration with automatic path resolution.

        Args:
            script_dir: Base directory. When omitted, the directory holding
                this source file is used. Plain strings and other path-like
                values are accepted and coerced to ``Path``.
        """
        if script_dir is None:
            script_dir = Path(__file__).parent.resolve()
        else:
            # Coerce so that a str argument does not break the "/" joins below.
            script_dir = Path(script_dir)

        self.script_dir = script_dir
        self.input_dir = script_dir / "input_html"
        self.output_dir = script_dir / "output_configs"
        self.config_dir = script_dir / "reference_configs"
        self.logs_dir = script_dir / "logs"
        self.backup_dir = script_dir / "backups"

        self._create_directories()

    def _create_directories(self) -> None:
        """Create required directories if they don't exist.

        Failures are reported on stdout and do not abort construction, so a
        read-only location still yields a usable ``Config`` object.
        """
        required = (
            self.input_dir,
            self.output_dir,
            self.config_dir,
            self.logs_dir,
            self.backup_dir,
        )
        for directory in required:
            try:
                directory.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                # mkdir only fails with OSError subclasses (permissions,
                # a file in the way, ...); anything else is a real bug.
                print(f"WARNING: Could not create directory {directory}: {e}")

class LoliScriptValidator:
    """Validates and corrects LoliScript (.loli) syntax.

    ``validate`` normalizes each non-comment line (typographic quotes,
    redundant whitespace, command keyword casing) and then checks that block
    keywords are balanced. Findings are collected in ``errors``,
    ``warnings`` and ``corrections``; all three are reset on every call.
    """

    LOLI_COMMANDS = {
        "FUNCTION", "REQUEST", "PARSE", "SET", "GIB", "KEYCHAIN",
        "NAVIGATE", "BROWSERACTION", "ELEMENTACTION", "EXECUTEJS",
        "IF", "ELSE", "ENDIF", "WHILE", "ENDWHILE", "JUMP", "LABEL",
        "DELETE", "UTILITY", "FOREACH", "ENDFOREACH", "LOOP", "ENDLOOP",
        "PRINT", "BYPASSCF", "TCP", "BEGIN", "END", "SCRIPT", "MOUSEACTION",
        "KEYCHECK", "RECAPTCHA", "CAPTCHA", "BLOCK", "ENDBLOCK"
    }

    LOLI_LOCATORS = {"ID", "CLASS", "SELECTOR", "XPATH", "NAME", "TAG"}
    LOLI_ELEMENT_ACTIONS = {
        "CLICK", "SENDKEYS", "GETTEXT", "GETATTRIBUTE", "SUBMIT",
        "CLEAR", "FOCUS", "HOVER", "DOUBLECLICK", "RIGHTCLICK"
    }
    LOLI_CONDITIONS = {
        "Contains", "Equals", "Regex", "Exists", "NotEqual",
        "Match", "EqualTo", "NotContains", "DoesNotExist"
    }

    # Typographic quotes mapped to their ASCII equivalents. Written as
    # escapes so the table survives editors that mangle non-ASCII characters.
    _QUOTE_TABLE = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
    })

    # A double-quoted literal. NOTE(review): assumes a quote inside a literal
    # is escaped with a backslash - confirm against the target dialect.
    _STRING_LITERAL = re.compile(r'("(?:\\.|[^"\\])*")')

    def __init__(self, logger: logging.Logger):
        """Store the logger and start with empty result lists."""
        self.logger = logger
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.corrections: List[str] = []

    def validate(self, content: str) -> Tuple[bool, str]:
        """Validate and correct LoliScript content.

        Args:
            content: Full script text.

        Returns:
            ``(success, corrected_content)``. ``success`` is True when no
            error was recorded; empty or whitespace-only input is an error
            and is returned unchanged.
        """
        self.errors = []
        self.warnings = []
        self.corrections = []

        if not content or not content.strip():
            self.errors.append("Empty LoliScript content")
            return False, content

        lines = content.split("\n")
        corrected_lines = []

        for line_num, line in enumerate(lines, 1):
            try:
                corrected_line = self._validate_line(line, line_num)
                corrected_lines.append(corrected_line)
            except Exception as e:
                # Best effort: a line that cannot be normalized is kept as-is.
                self.logger.warning(f"Error processing line {line_num}: {e}")
                corrected_lines.append(line)

        corrected_content = "\n".join(corrected_lines)
        self._validate_structure(corrected_content)

        success = len(self.errors) == 0

        if self.corrections:
            self.logger.info(f"LoliScript: Applied {len(self.corrections)} corrections")
        if self.warnings:
            self.logger.warning(f"LoliScript: {len(self.warnings)} warnings")
        if self.errors:
            self.logger.error(f"LoliScript: {len(self.errors)} errors")

        return success, corrected_content

    def _validate_line(self, line: str, line_num: int) -> str:
        """Validate and correct a single line.

        Blank lines and lines starting with ``#`` are returned untouched.
        Every other line is stripped, quote-normalized, whitespace-collapsed
        outside string literals and has its command keyword upper-cased. A
        line whose text changed (beyond the surrounding whitespace) is
        recorded in ``self.corrections``.
        """
        stripped = line.strip()

        if not stripped or stripped.startswith("#"):
            return line

        # Quotes first, so literals delimited by typographic quotes are
        # recognised (and protected) by the whitespace pass.
        normalized = self._fix_quotes(stripped)
        normalized = self._collapse_whitespace(normalized)
        normalized = self._validate_command_syntax(normalized)

        if normalized != stripped:
            self.corrections.append(f"Line {line_num}: normalized syntax")

        return normalized

    def _fix_quotes(self, line: str) -> str:
        """Replace typographic double/single quotes with ASCII quotes."""
        return line.translate(self._QUOTE_TABLE)

    def _collapse_whitespace(self, line: str) -> str:
        """Collapse whitespace runs to one space, leaving string literals intact."""
        # re.split with a single capturing group alternates
        # code (even index) and literal (odd index) segments.
        segments = self._STRING_LITERAL.split(line)
        for index in range(0, len(segments), 2):
            segments[index] = re.sub(r"\s+", " ", segments[index])
        return "".join(segments).strip()

    def _validate_command_syntax(self, line: str) -> str:
        """Upper-case the leading token when it is a known command.

        Only the first token is rewritten; the remainder of the line is kept
        verbatim so the contents of quoted arguments are never altered.
        """
        parts = line.split(None, 1)
        if not parts:
            return line

        cmd = parts[0].upper()
        if cmd not in self.LOLI_COMMANDS:
            return line

        parts[0] = cmd
        return " ".join(parts)

    def _validate_structure(self, content: str) -> None:
        """Validate overall script structure.

        Counts opening and closing block keywords (case-insensitively) and
        records an error for every pair whose counts differ. String literals
        are blanked first so that words inside quoted arguments do not count
        as keywords.
        """
        # Blank literals line by line so an unbalanced quote on one line
        # cannot swallow keywords on the following lines.
        code = "\n".join(
            self._STRING_LITERAL.sub('""', line) for line in content.split("\n")
        )

        pairs = [
            (r"\bIF\b", r"\bENDIF\b", "IF", "ENDIF"),
            (r"\bWHILE\b", r"\bENDWHILE\b", "WHILE", "ENDWHILE"),
            (r"\bFOREACH\b", r"\bENDFOREACH\b", "FOREACH", "ENDFOREACH"),
            (r"\bBLOCK\b", r"\bENDBLOCK\b", "BLOCK", "ENDBLOCK"),
        ]

        for open_pattern, close_pattern, open_name, close_name in pairs:
            open_count = len(re.findall(open_pattern, code, re.IGNORECASE))
            close_count = len(re.findall(close_pattern, code, re.IGNORECASE))

            if open_count != close_count:
                self.errors.append(
                    f"Mismatched {open_name}/{close_name}: {open_count} vs {close_count}"
                )

class VBScriptValidator:
    """Validates and corrects VBScript (.svb) syntax.

    ``validate`` normalizes each non-comment line (typographic double quotes,
    spacing around ``=``, ``&`` and ``+`` outside string literals) and then
    checks that ``Sub``/``Function``/block-``If`` statements are closed.
    Findings are collected in ``errors``, ``warnings`` and ``corrections``;
    all three are reset on every call.
    """

    VB_KEYWORDS = {
        "Sub", "End", "Function", "Dim", "Set", "If", "Then", "Else", "ElseIf",
        "EndIf", "For", "Next", "While", "Wend", "Do", "Loop", "Select", "Case",
        "Exit", "Call", "Return", "Class", "Private", "Public", "Const", "On",
        "Error", "Resume", "With", "Option", "Explicit", "ReDim", "Preserve",
        "CreateObject", "GetObject", "IsObject", "IsEmpty", "IsNull", "Nothing"
    }

    # Typographic double quotes mapped to ASCII. Single quotes are left
    # alone because an apostrophe starts a comment in VBScript.
    _QUOTE_TABLE = str.maketrans({"\u201c": '"', "\u201d": '"'})

    # A VBScript string literal; a doubled quote ("") is an escaped quote.
    _STRING_LITERAL = re.compile(r'("(?:[^"]|"")*")')

    # "Rem" as a whole word, in any casing, introduces a comment.
    _REM_COMMENT = re.compile(r"rem\b", re.IGNORECASE)

    # Optional visibility / Default modifiers in front of a declaration.
    _DECL_PREFIX = r"^\s*(?:(?:Public|Private)\s+)?(?:Default\s+)?"

    # (opener, closer, opener name, closer name), matched per logical line.
    _BLOCK_PATTERNS = (
        (
            re.compile(_DECL_PREFIX + r"Sub\b", re.IGNORECASE),
            re.compile(r"^\s*End\s+Sub\b", re.IGNORECASE),
            "Sub",
            "End Sub",
        ),
        (
            re.compile(_DECL_PREFIX + r"Function\b", re.IGNORECASE),
            re.compile(r"^\s*End\s+Function\b", re.IGNORECASE),
            "Function",
            "End Function",
        ),
        (
            # Only a block If (nothing after Then) needs an End If.
            re.compile(r"^\s*If\b.*\bThen\s*$", re.IGNORECASE),
            re.compile(r"^\s*End\s+If\b", re.IGNORECASE),
            "If",
            "End If",
        ),
    )

    def __init__(self, logger: logging.Logger):
        """Store the logger and start with empty result lists."""
        self.logger = logger
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.corrections: List[str] = []

    def validate(self, content: str) -> Tuple[bool, str]:
        """Validate and correct VBScript content.

        Args:
            content: Full script text.

        Returns:
            ``(success, corrected_content)``. ``success`` is True when no
            error was recorded; empty or whitespace-only input is an error
            and is returned unchanged.
        """
        self.errors = []
        self.warnings = []
        self.corrections = []

        if not content or not content.strip():
            self.errors.append("Empty VBScript content")
            return False, content

        lines = content.split("\n")
        corrected_lines = []

        for line_num, line in enumerate(lines, 1):
            try:
                corrected_line = self._validate_line(line, line_num)
                corrected_lines.append(corrected_line)
            except Exception as e:
                # Best effort: a line that cannot be normalized is kept as-is.
                self.logger.warning(f"Error processing line {line_num}: {e}")
                corrected_lines.append(line)

        corrected_content = "\n".join(corrected_lines)
        self._validate_structure(corrected_content)

        success = len(self.errors) == 0

        if self.corrections:
            self.logger.info(f"VBScript: Applied {len(self.corrections)} corrections")
        if self.warnings:
            self.logger.warning(f"VBScript: {len(self.warnings)} warnings")
        if self.errors:
            self.logger.error(f"VBScript: {len(self.errors)} errors")

        return success, corrected_content

    def _validate_line(self, line: str, line_num: int) -> str:
        """Validate and correct a single line.

        Blank lines and comment lines (leading apostrophe or ``Rem``) are
        returned untouched. Otherwise typographic quotes are replaced and
        operator spacing is normalized in the code parts of the line only;
        the contents of string literals are never modified. A changed line
        is recorded in ``self.corrections``.
        """
        stripped = line.strip()

        if not stripped or stripped.startswith("'") or self._REM_COMMENT.match(stripped):
            return line

        # Quotes first, so literals delimited by typographic quotes are
        # recognised (and protected) by the operator pass.
        fixed = self._fix_quotes(line)

        # re.split with a single capturing group alternates
        # code (even index) and literal (odd index) segments.
        segments = self._STRING_LITERAL.split(fixed)
        for index in range(0, len(segments), 2):
            segments[index] = self._space_operators(segments[index])
        fixed = "".join(segments)

        if fixed != line:
            self.corrections.append(f"Line {line_num}: normalized operators/quotes")

        return fixed

    def _space_operators(self, code: str) -> str:
        """Put single spaces around ``=``, ``&`` and ``+`` in a code segment."""
        # Two-character comparison operators are kept together.
        code = re.sub(r"\s*(<=|>=|=<|=>|=)\s*", r" \1 ", code)
        # "&H.." is a hex literal prefix, not the concatenation operator.
        code = re.sub(r"\s*&(?![Hh][0-9A-Fa-f])\s*", " & ", code)
        code = re.sub(r"\s*\+\s*", " + ", code)
        return code

    def _fix_quotes(self, line: str) -> str:
        """Replace typographic double quotes with ASCII double quotes."""
        return line.translate(self._QUOTE_TABLE)

    def _logical_lines(self, content: str) -> List[str]:
        """Return code-only logical lines of *content*.

        String literals are blanked, trailing apostrophe comments removed and
        lines ending in the `` _`` continuation marker joined with their
        successor.
        """
        logical: List[str] = []
        pending = ""
        for raw in content.split("\n"):
            code = self._STRING_LITERAL.sub('""', raw)
            code = code.split("'", 1)[0].rstrip()
            if code.endswith(" _"):
                pending += code[:-1]
                continue
            logical.append(pending + code)
            pending = ""
        if pending:
            logical.append(pending)
        return logical

    def _validate_structure(self, content: str) -> None:
        """Validate overall VBScript structure.

        Counts statements that open a ``Sub``, ``Function`` or block ``If``
        and the matching ``End ...`` statements, and records an error for
        every pair whose counts differ. Matching is anchored at the start of
        a logical line so that ``End Sub``, ``Exit Sub``, single-line ``If``
        statements and keywords inside strings or comments are not counted
        as openers.
        """
        lines = self._logical_lines(content)

        for open_re, close_re, open_name, close_name in self._BLOCK_PATTERNS:
            open_count = sum(1 for text in lines if open_re.match(text))
            close_count = sum(1 for text in lines if close_re.match(text))

            if open_count != close_count:
                self.errors.append(
                    f"Mismatched {open_name}/{close_name}: {open_count} vs {close_count}"
                )

@dataclass
class FormElement:
    """A single form control (input, textarea, select) found in a page."""
    # Value of the control's "name" attribute.
    name: str
    # Control type, e.g. the "type" attribute of an <input>.
    type: str
    # Optional descriptive attributes; None means "not recorded".
    id: Optional[str] = None
    selector: Optional[str] = None
    placeholder: Optional[str] = None
    # Whether the control is marked as required.
    required: bool = False

    def to_dict(self) -> Dict:
        """Return the element as a plain dict without the None-valued entries."""
        payload = asdict(self)
        unset_keys = [key for key, value in payload.items() if value is None]
        for key in unset_keys:
            del payload[key]
        return payload

@dataclass
class FormData:
    """One <form> of a page together with the controls it contains."""
    # "id" and "name" attributes of the form, when known.
    id: Optional[str]
    name: Optional[str]
    # Submission target and HTTP method.
    action: str
    method: str
    # Selector locating the form element.
    selector: str
    # Controls belonging to the form.
    fields: List[FormElement] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return the form as a plain dict with every field serialized."""
        serialized_fields = []
        for element in self.fields:
            serialized_fields.append(element.to_dict())

        return dict(
            id=self.id,
            name=self.name,
            action=self.action,
            method=self.method,
            selector=self.selector,
            fields=serialized_fields,
        )

@dataclass
class PageAnalysis:
    """Everything extracted from one HTML page."""
    # Name of the analysed file and the domain associated with it.
    file: str
    domain: str
    # Page title, or None when the page has none.
    title: Optional[str]
    # Extracted elements, grouped by kind.
    forms: List[FormData] = field(default_factory=list)
    inputs: List[Dict] = field(default_factory=list)
    buttons: List[Dict] = field(default_factory=list)
    links: List[Dict] = field(default_factory=list)
    selectable_elements: List[Dict] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return the analysis as a plain dict; forms are serialized recursively."""
        result: Dict = {}
        result["file"] = self.file
        result["domain"] = self.domain
        result["title"] = self.title
        result["forms"] = list(map(lambda form: form.to_dict(), self.forms))
        # The remaining collections already hold plain dicts and are passed through.
        result["inputs"] = self.inputs
        result["buttons"] = self.buttons
        result["links"] = self.links
        result["selectable_elements"] = self.selectable_elements
        return result

class PatternLearner:
    """Learn config patterns from existing reference files.

    On construction the given directory is scanned for ``*.loli``, ``*.svb``
    and ``*.json`` files. Files are processed in sorted order so that the
    learned patterns, and the config returned for a domain that has several
    configs, do not depend on filesystem enumeration order.
    """

    # A line that declares a Sub/Function (optionally Public/Private/Default).
    # Anchored so that "End Sub", "Exit Function" or identifiers that merely
    # contain "Sub" are not collected as declarations.
    _SVB_DECLARATION = re.compile(
        r"^\s*(?:(?:Public|Private)\s+)?(?:Default\s+)?(?:Function|Sub)\b",
        re.IGNORECASE,
    )

    def __init__(self, config_dir: Path, logger: logging.Logger):
        """Set up empty pattern stores and learn from *config_dir* if it exists.

        Args:
            config_dir: Directory holding the reference files.
            logger: Logger used for progress and problem reports.
        """
        self.config_dir = Path(config_dir)
        self.logger = logger
        self.loli_patterns = self._empty_loli_patterns()
        self.svb_patterns = self._empty_svb_patterns()
        self.domain_configs: Dict[str, List[Path]] = defaultdict(list)

        if self.config_dir.exists():
            self._learn_patterns()
        else:
            self.logger.warning(f"Config directory does not exist: {config_dir}")

    def _empty_loli_patterns(self) -> Dict:
        """Return empty LoliScript pattern template."""
        return {
            "structure": [],
            "commands": defaultdict(list),
            "variable_format": "<VAR_NAME>",
            "request_template": None,
            "parse_template": None,
            "keycheck_template": None,
            "examples": []
        }

    def _empty_svb_patterns(self) -> Dict:
        """Return empty VBScript pattern template."""
        return {
            "structure": [],
            "functions": [],
            "variable_declaration": "Dim {var}",
            "http_request": None,
            "element_interaction": None,
            "examples": []
        }

    def _learn_patterns(self) -> None:
        """Extract and learn patterns from example files.

        Any unexpected failure is logged and swallowed so that constructing
        the learner never raises because of bad reference data.
        """
        try:
            self._learn_loli_patterns()
            self._learn_svb_patterns()
            self._map_domains_to_configs()
            self.logger.info("Pattern learning completed successfully")
        except Exception as e:
            self.logger.error(f"Error learning patterns: {e}")
            self.logger.debug(traceback.format_exc())

    def _learn_loli_patterns(self) -> None:
        """Extract and learn LoliScript patterns from .loli files.

        For every file the first ten lines are appended to ``structure``, the
        file name to ``examples``, and each line whose first token is a known
        LoliScript command is stored under that command (upper-cased).
        """
        loli_files = sorted(self.config_dir.glob("*.loli"))
        if not loli_files:
            self.logger.warning("No .loli files found in config directory")
            return

        for loli_file in loli_files:
            try:
                content = loli_file.read_text(encoding='utf-8')
            except (OSError, ValueError) as e:
                # ValueError covers UnicodeDecodeError for non-UTF-8 files.
                self.logger.warning(f"Error reading {loli_file}: {e}")
                continue

            lines = content.split('\n')
            self.loli_patterns["structure"].extend(lines[:10])
            self.loli_patterns["examples"].append(str(loli_file.name))

            for line in lines:
                tokens = line.split()
                if not tokens:
                    continue
                command = tokens[0].upper()
                if command in LoliScriptValidator.LOLI_COMMANDS:
                    self.loli_patterns["commands"][command].append(line)

    def _learn_svb_patterns(self) -> None:
        """Extract and learn VBScript patterns from .svb files.

        For every file the first ten lines are appended to ``structure``, the
        file name to ``examples``, and every Sub/Function declaration line to
        ``functions``.
        """
        svb_files = sorted(self.config_dir.glob("*.svb"))
        if not svb_files:
            self.logger.warning("No .svb files found in config directory")
            return

        for svb_file in svb_files:
            try:
                content = svb_file.read_text(encoding='utf-8')
            except (OSError, ValueError) as e:
                # ValueError covers UnicodeDecodeError for non-UTF-8 files.
                self.logger.warning(f"Error reading {svb_file}: {e}")
                continue

            lines = content.split('\n')
            self.svb_patterns["structure"].extend(lines[:10])
            self.svb_patterns["examples"].append(str(svb_file.name))
            self.svb_patterns["functions"].extend(
                line for line in lines if self._SVB_DECLARATION.match(line)
            )

    def _map_domains_to_configs(self) -> None:
        """Map domains to their corresponding config files.

        A ``*.json`` file is registered when its top-level value is an object
        with a string ``domain`` entry; other files are skipped.
        """
        for config_file in sorted(self.config_dir.glob("*.json")):
            try:
                with open(config_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both JSON syntax and decoding errors.
                self.logger.warning(f"Error reading config {config_file}: {e}")
                continue

            if not isinstance(data, dict):
                continue
            domain = data.get('domain')
            if isinstance(domain, str):
                self.domain_configs[domain].append(config_file)

    def get_loli_pattern(self, command: str) -> Optional[List[str]]:
        """Retrieve LoliScript patterns for a specific command.

        Returns a new list (empty when nothing was learned for *command*), so
        callers cannot modify the learned patterns by accident.
        """
        return list(self.loli_patterns["commands"].get(command.upper(), []))

    def get_svb_pattern(self, function_name: str) -> Optional[List[str]]:
        """Retrieve the learned declaration lines that contain *function_name*."""
        return [f for f in self.svb_patterns["functions"] if function_name in f]

    def get_domain_config(self, domain: str) -> Optional[Path]:
        """Retrieve config file for a specific domain.

        Returns the first registered config for *domain*, or None when no
        config is known for it.
        """
        configs = self.domain_configs.get(domain, [])
        return configs[0] if configs else None

class HTMLParser:
    """Parse HTML files and extract forms, inputs, buttons, and links."""

    def __init__(self, logger: logging.Logger):
        """Keep the logger and build one validator per supported script type."""
        self.logger = logger
        # Both validators report through the same logger as the parser itself.
        self.validator_svb = VBScriptValidator(self.logger)
        self.validator_loli = LoliScriptValidator(self.logger)

    def parse_file(self, html_file: Path) -> Optional[PageAnalysis]:
        """Parse an HTML file and extract all relevant elements.

        Args:
            html_file: Path of the HTML file to analyse.

        Returns:
            A populated ``PageAnalysis``, or None when reading or parsing
            failed (the failure is logged, with the traceback at debug level).
        """
        try:
            # Saved pages are not always valid UTF-8; undecodable bytes are
            # replaced instead of discarding the whole file.
            with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
                html_content = f.read()

            soup = BeautifulSoup(html_content, 'html.parser')
            domain = self._extract_domain(html_file.name)

            # Tag.string is a NavigableString that keeps a reference to the
            # whole parse tree; copy it into a plain str before storing it.
            title_tag = soup.title
            raw_title = title_tag.string if title_tag is not None else None
            title = str(raw_title) if raw_title is not None else None

            page_analysis = PageAnalysis(
                file=html_file.name,
                domain=domain,
                title=title
            )

            page_analysis.forms = self._extract_forms(soup)
            page_analysis.inputs = self._extract_inputs(soup)
            page_analysis.buttons = self._extract_buttons(soup)
            page_analysis.links = self._extract_links(soup)
            page_analysis.selectable_elements = self._extract_selectable_elements(soup)

            self.logger.info(f"Successfully parsed {html_file.name}")
            return page_analysis

        except Exception as e:
            # Top-level boundary for one file: report and signal failure
            # through the return value rather than aborting a batch run.
            self.logger.error(f"Error parsing {html_file.name}: {e}")
            self.logger.debug(traceback.format_exc())
            return None

    def _extract_domain(self, filename: str) -> str:
        """Extract domain from filename."""
        return filename.split('_')[0] if '_' in filename else filename.split('.')[0]

    def _extract_forms(self, soup: BeautifulSoup) -> List[FormData]:
        """Collect every <form> in the document together with its controls.

        A form without an ``id`` attribute gets ``form_<index>``; a missing
        ``method`` defaults to ``GET`` and the method is upper-cased. The
        input, textarea and select descendants become ``FormElement`` objects
        whose type falls back to the tag name when no ``type`` attribute is
        present.
        """
        collected = []
        for position, form_tag in enumerate(soup.find_all('form')):
            # Form-level attributes (and its selector) are read before the controls.
            form_attrs = dict(
                id=form_tag.get('id', f'form_{position}'),
                name=form_tag.get('name', ''),
                action=form_tag.get('action', ''),
                method=form_tag.get('method', 'GET').upper(),
                selector=self._generate_css_selector(form_tag),
            )

            controls = []
            for control in form_tag.find_all(['input', 'textarea', 'select']):
                control_attrs = dict(
                    name=control.get('name', ''),
                    type=control.get('type', control.name),
                    id=control.get('id', ''),
                    placeholder=control.get('placeholder', ''),
                    required=control.has_attr('required'),
                )
                control_attrs['selector'] = self._generate_css_selector(control)
                controls.append(FormElement(**control_attrs))

            collected.append(FormData(fields=controls, **form_attrs))

        return collected

    def _extract_inputs(self, soup: BeautifulSoup) -> List[Dict]:
        """Describe every <input> element of the document as a plain dict.

        Missing attributes fall back to defaults: ``input_<index>`` for the
        id, ``text`` for the type and an empty string for name, value and
        placeholder.
        """
        return [
            {
                'id': element.get('id', f'input_{position}'),
                'name': element.get('name', ''),
                'type': element.get('type', 'text'),
                'value': element.get('value', ''),
                'placeholder': element.get('placeholder', ''),
                'required': element.has_attr('required'),
                'selector': self._generate_css_selector(element),
            }
            for position, element in enumerate(soup.find_all('input'))
        ]

    def _extract_buttons(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract all button elements."""
        buttons = []
        for idx, btn in enumerate(soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'])):