import os
import re

# Absolute directory holding the pages to process; process_page() joins each
# listed page file name, and the same-named sub-folder, onto this path.
BASE_DIR = "/var/www/websitecategorizationapi.com/public_html/domains-database"
# Text file read by main(): one page file name per line (blank lines ignored).
# It is a relative path, so it is resolved against the current working directory.
TXT_FILE = "vertical_pages_list.txt"

def get_name_variations(filename):
    """
    Return the candidate link texts derived from a sub-page file name.

    Underscores become spaces, and both the as-is and title-cased spellings
    are produced, together with "and" -> "&" and lowercase-"and" alternatives.
    e.g. 'venture_capital.php' -> ['Venture Capital', 'venture capital']

    Args:
        filename: File name such as 'arts_and_entertainment.php'. Only a
            trailing '.php' is removed; any other '.php' inside the name is
            kept.

    Returns:
        A list of unique strings, longest first (so the most specific text is
        tried before shorter ones). Variants of equal length keep a fixed
        order, so the result is identical on every run.
    """
    # Strip the extension only; str.replace would also eat a '.php' that
    # happens to occur in the middle of the name.
    name = filename[:-len('.php')] if filename.endswith('.php') else filename

    spaced = name.replace('_', ' ')   # venture capital
    titled = spaced.title()           # Venture Capital

    candidates = [
        titled,
        spaced,
        titled.replace(' And ', ' & '),    # Arts & Entertainment
        spaced.replace(' and ', ' & '),    # arts & entertainment
        titled.replace(' And ', ' and '),  # Arts and Entertainment
    ]

    # dict.fromkeys removes duplicates while preserving insertion order (a set
    # would make the order of equal-length variants vary between runs), and
    # sorted() is stable, so ties stay in the order listed above.
    unique = list(dict.fromkeys(candidates))
    return sorted(unique, key=len, reverse=True)

# Matches a heading (<h1>..<h6>) whose own text contains "Subcategories" or
# "Sub-categories". The tempered dot `(?:(?!</h[1-6]>).)*?` stops the lazy scan
# at the heading's closing tag, so the keyword has to sit inside the heading
# itself rather than anywhere later in the document.
_SUBCATEGORY_HEADER_RE = re.compile(
    r'(<h[1-6][^>]*>(?:(?!</h[1-6]>).)*?(?:Sub-?categories).*?</h[1-6]>)',
    re.IGNORECASE | re.DOTALL,
)

# Start of the next <section ...> tag, used as the end of the target area.
_NEXT_SECTION_RE = re.compile(r'<section', re.IGNORECASE)

# Opening (<a ...>, <a>) and closing (</a>) anchor tags; group 1 is '/' for
# a closing tag and empty for an opening tag.
_ANCHOR_TAG_RE = re.compile(r'<(/?)a[\s>]', re.IGNORECASE)


def _strip_php_suffix(filename):
    """Return filename without a trailing '.php'; other occurrences are kept."""
    return filename[:-len('.php')] if filename.endswith('.php') else filename


def _is_inside_anchor(html, end):
    """Return True if the last anchor tag found in html[:end] is an opening tag."""
    last_tag = None
    for last_tag in _ANCHOR_TAG_RE.finditer(html, 0, end):
        pass
    return last_tag is not None and not last_tag.group(1)


def process_page(page_file):
    """
    Link subcategory names in one page to their sub-pages.

    The page BASE_DIR/<page_file> is expected to have a sibling folder named
    after the page (without '.php') containing one '.php' file per
    subcategory. Inside the page, the area between the first heading that
    contains "Subcategories"/"Sub-categories" and the next <section tag (or
    the end of the file) is scanned. For every sub-file, each text variation
    from get_name_variations() is tried (case-insensitively) as a whole text
    node, i.e. '>Name<' with optional surrounding whitespace; the first
    variation that matches is wrapped in an <a> pointing at
    /domains-database/<folder>/<sub_file>.

    A sub-file is skipped when its href is already present in the area, and a
    text node that is already inside an <a> element is left untouched so that
    anchors are never nested. The page is rewritten only if something changed.

    Args:
        page_file: File name of the page, relative to BASE_DIR.

    Returns:
        None. The function returns silently when the page, its folder, its
        sub-files or the "Subcategories" heading cannot be found.
    """
    page_path = os.path.join(BASE_DIR, page_file)
    folder_name = _strip_php_suffix(page_file)
    folder_path = os.path.join(BASE_DIR, folder_name)

    if not os.path.exists(page_path):
        return

    if not os.path.isdir(folder_path):
        return

    # Sorted so that pages are processed (and logged) in a reproducible order;
    # os.listdir() returns entries in arbitrary order.
    try:
        sub_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.php'))
    except OSError as e:
        print(f"Error reading folder {folder_path}: {e}")
        return

    if not sub_files:
        return

    # newline='' disables newline translation so the file's original line
    # endings (e.g. CRLF) survive the read/modify/write round trip.
    with open(page_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    match = _SUBCATEGORY_HEADER_RE.search(content)
    if not match:
        return

    header_end_pos = match.end()

    # Heuristic: the subcategory area ends where the next <section> begins.
    next_section = _NEXT_SECTION_RE.search(content, header_end_pos)
    section_end_pos = next_section.start() if next_section else len(content)

    pre_section = content[:header_end_pos]
    modified_section = content[header_end_pos:section_end_pos]
    post_section = content[section_end_pos:]

    changes_made = False

    for sub_file in sub_files:
        link_href = f"/domains-database/{folder_name}/{sub_file}"

        # Already linked (with either quote style)? Then leave this sub-file
        # alone. The check does not depend on the text variation, so it is
        # done once per sub-file.
        href_pattern = r'href=["\']' + re.escape(link_href) + r'["\']'
        if re.search(href_pattern, modified_section):
            continue

        def replacement(m, link_href=link_href):
            # The match starts at the '>' that closes the preceding tag, so
            # include that character when looking for an enclosing <a>.
            if _is_inside_anchor(m.string, m.start() + 1):
                return m.group(0)
            return f'{m.group(1)}<a href="{link_href}" style="color: var(--text-primary); text-decoration: none;">{m.group(2)}</a>{m.group(3)}'

        for text in get_name_variations(sub_file):
            # Group 1: '>' plus whitespace, group 2: the text, group 3:
            # whitespace plus '<'. Anchoring on tag boundaries keeps the
            # pattern from matching inside attribute values.
            pattern = re.compile(r'(>\s*)(' + re.escape(text) + r')(\s*<)', re.IGNORECASE)

            new_section = pattern.sub(replacement, modified_section)
            if new_section != modified_section:
                modified_section = new_section
                changes_made = True
                print(f"Linked '{text}' in {page_file}")
                break

    if changes_made:
        final_content = pre_section + modified_section + post_section
        with open(page_path, 'w', encoding='utf-8', newline='') as f:
            f.write(final_content)

def main():
    """
    Run process_page() for every page listed in TXT_FILE.

    TXT_FILE is read as UTF-8 text with one page file name per line;
    surrounding whitespace is stripped and blank lines are ignored. If the
    file does not exist, an error is printed and nothing is processed.
    """
    # Open directly instead of checking os.path.exists() first, so the file
    # cannot disappear between the check and the read. The encoding is
    # explicit to match how the pages themselves are read, rather than
    # depending on the platform's locale default.
    try:
        with open(TXT_FILE, 'r', encoding='utf-8') as f:
            files = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: {TXT_FILE} not found.")
        return

    print(f"Processing {len(files)} files from {TXT_FILE}...")
    for page_file in files:
        process_page(page_file)
    print("Done.")

# Run the batch only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
