"""Link subcategory titles on vertical pages to their subcategory pages.

For every page listed in ``TXT_FILE`` (e.g. ``finance.php``) that has a
sibling folder of the same name (``finance/``), the "Subcategories" section
of the page is scanned for plain-text titles matching the ``.php`` files in
that folder, and each unlinked title is wrapped in an ``<a href>`` tag.

NOTE(review): the copy of this script that was reviewed had its HTML tag
literals stripped and part of ``process_page`` missing. The heading regex,
the section-boundary heuristic, the href format and the anchor markup were
reconstructed from the surviving comments and variable names; each such
spot is marked with ``NOTE(review)`` and should be confirmed against the
deployed pages before running on production files.
"""
import os
import re

BASE_DIR = "/var/www/websitecategorizationapi.com/public_html/domains-database"
TXT_FILE = "vertical_pages_list.txt"

_PHP_SUFFIX = ".php"

# A heading (<h1>..<h6>) whose content mentions "Subcategories" or
# "Sub-categories". Group 1 is the heading level. The tempered dot keeps the
# match from starting at an earlier, unrelated heading.
# NOTE(review): reconstructed; the original pattern was mangled.
_SUBCATEGORIES_HEADER_RE = re.compile(
    r'<h([1-6])[^>]*>(?:(?!</?h[1-6][\s>]).)*?Sub-?categories.*?</h\1\s*>',
    re.IGNORECASE | re.DOTALL,
)

_ANCHOR_OPEN_RE = re.compile(r'<a[\s>]', re.IGNORECASE)
_ANCHOR_CLOSE_RE = re.compile(r'</a\s*>', re.IGNORECASE)


def _strip_php_suffix(filename):
    """Return *filename* without a trailing ``.php`` (other text untouched)."""
    if filename.endswith(_PHP_SUFFIX):
        return filename[:-len(_PHP_SUFFIX)]
    return filename


def get_name_variations(filename):
    """Return the candidate link texts for a subcategory file name.

    Underscores become spaces, and both the lowercase and title-cased forms
    are produced, together with "and" / "&" alternatives, e.g.
    ``'venture_capital.php' -> ['Venture Capital', 'venture capital']``.

    Args:
        filename: File name such as ``'mergers_and_acquisitions.php'``.

    Returns:
        Unique variants, longest first. Variants of equal length keep a
        fixed order (title-cased forms before lowercase ones), so the result
        is deterministic across runs.
    """
    spaced = _strip_php_suffix(filename).replace('_', ' ')
    titled = spaced.title()
    candidates = [
        titled,                              # Venture And Capital
        titled.replace(' And ', ' & '),      # Venture & Capital
        titled.replace(' And ', ' and '),    # Venture and Capital
        spaced,                              # venture and capital
        spaced.replace(' and ', ' & '),      # venture & capital
    ]
    # dict.fromkeys de-duplicates while keeping insertion order; sorted() is
    # stable (also with reverse=True), so ties never depend on hash order.
    return sorted(dict.fromkeys(candidates), key=len, reverse=True)


def _inside_anchor(markup, end):
    """Return True if position *end* of *markup* lies inside an open <a> tag."""
    last_open = max(
        (m.start() for m in _ANCHOR_OPEN_RE.finditer(markup, 0, end)),
        default=-1,
    )
    last_close = max(
        (m.start() for m in _ANCHOR_CLOSE_RE.finditer(markup, 0, end)),
        default=-1,
    )
    return last_open > last_close


def _wrap_unlinked(pattern, markup, href):
    """Wrap every match of *pattern* in *markup* that is not already linked."""
    def replacement(m):
        # m.start() is the '>' closing the preceding tag; include it so a
        # bare "<a>" directly before the text is detected as well.
        if _inside_anchor(markup, m.start() + 1):
            return m.group(0)
        # NOTE(review): anchor markup reconstructed; confirm attributes.
        return f'{m.group(1)}<a href="{href}">{m.group(2)}</a>{m.group(3)}'

    return pattern.sub(replacement, markup)


def _link_subcategories(section, folder_name, sub_files, page_file):
    """Return *section* with subcategory titles linked to their sub-pages."""
    for sub_file in sorted(sub_files):
        # NOTE(review): href format reconstructed (relative "<folder>/<file>").
        link_href = f"{folder_name}/{sub_file}"

        # Skip sub-pages that are already linked to avoid double-linking.
        if re.search(r'href=["\']' + re.escape(link_href) + r'["\']', section):
            continue

        for text in get_name_variations(sub_file):
            # Group 1: preceding '>' plus whitespace
            # Group 2: the title text
            # Group 3: whitespace plus following '<'
            # Matching only ">Text<" keeps attribute values out of reach.
            pattern = re.compile(
                r'(>\s*)(' + re.escape(text) + r')(\s*<)', re.IGNORECASE
            )
            new_section = _wrap_unlinked(pattern, section, link_href)
            if new_section != section:
                section = new_section
                print(f"Linked '{text}' in {page_file}")
                break
    return section


def process_page(page_file, base_dir=None):
    """Link the subcategory titles of one vertical page, rewriting it in place.

    The page is left untouched when the page file or its same-named folder
    is missing, when the folder holds no ``.php`` files, when no
    "Subcategories" heading is found, or when nothing needs linking.

    Args:
        page_file: Page file name relative to the base directory,
            e.g. ``'finance.php'``.
        base_dir: Directory holding the pages; defaults to ``BASE_DIR``.
    """
    root = BASE_DIR if base_dir is None else base_dir
    page_path = os.path.join(root, page_file)
    folder_name = _strip_php_suffix(page_file)
    folder_path = os.path.join(root, folder_name)

    if not os.path.isfile(page_path):
        return
    if not os.path.isdir(folder_path):
        return

    try:
        sub_files = [f for f in os.listdir(folder_path) if f.endswith('.php')]
    except OSError as e:
        print(f"Error reading folder {folder_path}: {e}")
        return
    if not sub_files:
        return

    with open(page_path, 'r', encoding='utf-8') as f:
        content = f.read()

    header = _SUBCATEGORIES_HEADER_RE.search(content)
    if not header:
        return
    section_start = header.end()

    # Heuristic: the section runs until the next heading of the same or a
    # higher level, so deeper headings (e.g. per-subcategory titles) stay
    # inside it.
    # NOTE(review): reconstructed; the original boundary tag was lost.
    next_heading_re = re.compile(
        r'<h[1-%d][\s>]' % int(header.group(1)), re.IGNORECASE
    )
    next_heading = next_heading_re.search(content, section_start)
    section_end = next_heading.start() if next_heading else len(content)

    pre_section = content[:section_start]
    section = content[section_start:section_end]
    post_section = content[section_end:]

    modified_section = _link_subcategories(
        section, folder_name, sub_files, page_file
    )
    if modified_section != section:
        final_content = pre_section + modified_section + post_section
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(final_content)


def main():
    """Process every page listed (one file name per line) in ``TXT_FILE``."""
    if not os.path.exists(TXT_FILE):
        print(f"Error: {TXT_FILE} not found.")
        return

    with open(TXT_FILE, 'r', encoding='utf-8') as f:
        files = [line.strip() for line in f if line.strip()]

    print(f"Processing {len(files)} files from {TXT_FILE}...")
    for page_file in files:
        process_page(page_file)
    print("Done.")


if __name__ == "__main__":
    main()