Update generate-image-previews.py

D David Veksler · 1 year ago 306d87b578841a8be41e9c16a62a866fc96ffb54
Parent: c9e815d14

1 file changed +219 −442

Diff

diff --git a/generate-image-previews.py b/generate-image-previews.py
index e7e4ba6..c1de4a5 100644
--- a/generate-image-previews.py
+++ b/generate-image-previews.py
@@ -1,467 +1,244 @@
-import sys
-import subprocess
-
-# Install dependencies
-subprocess.run([sys.executable, "-m", "pip", "install", "beautifulsoup4", "lxml", "playwright"], check=True)
-subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
-
-
-# Now, re-declare and run the full script
 import logging
+import argparse
 from pathlib import Path
-from typing import List, Tuple, Dict, Any, Optional
-from urllib.parse import urlparse, urlunparse
-
-from bs4 import BeautifulSoup
-from playwright.sync_api import sync_playwright, Error as PlaywrightError
-
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-
-# --- Constants for Meta Tags ---
+from typing import List, Dict, Set
+from urllib.parse import urlparse
+
+# Optional dependency loading
+try:
+    from playwright.sync_api import sync_playwright, Error as PlaywrightError
+    PLAYWRIGHT_AVAILABLE = True
+except ImportError:
+    PLAYWRIGHT_AVAILABLE = False
+
+try:
+    from bs4 import BeautifulSoup, FeatureNotFound
+except ImportError:
+    print("Error: BeautifulSoup is not installed. Please run: python3 -m pip install beautifulsoup4")
+    exit(1)
+
+# --- Configuration & Constants ---
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
 OG_TYPE_DEFAULT = "website"
 TWITTER_CARD_DEFAULT = "summary_large_image"
-DEFAULT_DESCRIPTION_PLACEHOLDER = "Read more about {} on our site."
-
-# --- Helper Functions for HTML Manipulation ---
-
-def _ensure_head_exists(soup: BeautifulSoup, html_path: Path) -> Optional[BeautifulSoup]:
-    """Ensures a <head> tag exists in the soup, creating it if necessary. Returns head tag or None."""
-    if not soup.head:
-        if soup.html:
-            head_tag = soup.new_tag("head")
-            soup.html.insert(0, head_tag)
-            logging.debug(f"Created <head> for {html_path.name}")
-            return head_tag
-        else:
-            logging.warning(
-                f"No <html> tag in {html_path.name}, cannot create <head>. Skipping meta tag additions for this file."
-            )
-            return None
-    return soup.head
-
-def _get_or_create_meta_tag(
-    soup: BeautifulSoup,
-    head: BeautifulSoup,
-    attributes: Dict[str, str],
-    content_value: Optional[str] = None,
-) -> Tuple[BeautifulSoup, bool]:
-    """
-    Finds a meta tag with given attributes or creates it if not found.
-    Updates content if content_value is provided, tag exists, and content differs.
-    Returns the tag and a boolean indicating if a change was made (created or content updated).
-    """
-    tag = head.find("meta", attrs=attributes)
-    changed = False
-    if not tag:
-        tag = soup.new_tag("meta", attrs=attributes)
-        if content_value is not None:
-            tag["content"] = content_value
-        head.append(tag)
-        changed = True
-    elif content_value is not None and tag.get("content") != content_value:
-        tag["content"] = content_value
-        changed = True
-    return tag, changed
-
-def _get_or_create_link_tag(
-    soup: BeautifulSoup,
-    head: BeautifulSoup,
-    attributes: Dict[str, str],
-    href_value: Optional[str] = None,
-) -> Tuple[BeautifulSoup, bool]:
-    """
-    Finds a link tag with given attributes or creates it if not found.
-    Updates href if href_value is provided, tag exists, and href differs.
-    Returns the tag and a boolean indicating if a change was made.
-    """
-    tag = head.find("link", attrs=attributes)
-    changed = False
-    if not tag:
-        tag = soup.new_tag("link", attrs=attributes)
-        if href_value is not None:
-            tag["href"] = href_value
-        insert_position = 0
-        for i, child in enumerate(list(head.children)):
-            if hasattr(child, 'name') and child.name == 'meta':
-                if child.get('charset') or child.get('name') == 'viewport':
-                    insert_position = i + 1
-
-        if not head.contents:
-            head.append(tag)
-        elif insert_position < len(head.contents):
-            head.insert(insert_position, tag)
-        else:
-            head.append(tag)
-        changed = True
-    elif href_value is not None and tag.get("href") != href_value:
-        tag["href"] = href_value
-        changed = True
-    return tag, changed
-
-
-def _extract_content(soup: BeautifulSoup, tag_name: str = "h1", default_text: str = "") -> str:
-    """Extracts text content from the first occurrence of a specified tag."""
-    element = soup.find(tag_name)
-    if element:
-        text_content = element.get_text(separator=' ', strip=True)
-        if text_content:
-            return text_content
-    return default_text
-
-# --- Core Logic Functions ---
-
-def analyze_html_file(
-    html_file: Path, base_url: str, http_base_url_netloc: str
-) -> Dict[str, Any]:
-    """
-    Analyzes a single HTML file for missing/incorrect SEO and social media meta tags.
-    Returns a dictionary with analysis results, including found tag objects.
-    """
-    analysis: Dict[str, Any] = {"path": html_file, "needs_update": False}
-    file_stem = html_file.stem
-    expected_image_name = f"{file_stem}.png"
-    expected_image_url = f"images/{expected_image_name}"
-    expected_canonical_url = f"{base_url}{html_file.name}"
-
-    try:
-        with open(html_file, "r", encoding="utf-8", errors="replace") as f:
-            content = f.read()
-            if not content.strip():
-                logging.warning(f"File {html_file.name} is empty or whitespace only. Skipping analysis.")
-                analysis["error"] = "Empty file"
-                return analysis
-            soup = BeautifulSoup(content, "lxml")
-
-        analysis["soup"] = soup # Store soup for use in update function if needed for extraction
-
-        # --- Find existing tags ---
-        og_image_tag = soup.find("meta", property="og:image")
-        twitter_image_tag = soup.find("meta", attrs={"name": "twitter:image"})
-        og_title_tag = soup.find("meta", property="og:title")
-        twitter_title_tag = soup.find("meta", attrs={"name": "twitter:title"})
-        og_description_tag = soup.find("meta", property="og:description")
-        twitter_description_tag = soup.find("meta", attrs={"name": "twitter:description"})
-        meta_description_tag = soup.find("meta", attrs={"name": "description"})
-        og_type_tag = soup.find("meta", property="og:type")
-        og_url_tag = soup.find("meta", property="og:url")
-        twitter_card_tag = soup.find("meta", attrs={"name": "twitter:card"})
-        canonical_link_tag = soup.find("link", rel="canonical")
-
-        # --- Determine if updates are needed ---
-
-        # 1. Social Media Image Tags & File
-        dir_path = html_file.parent
-        expected_image_path = dir_path / expected_image_url
-
-        if not og_image_tag or og_image_tag.get("content") != expected_image_url:
-            analysis["og_image_needs_update"] = True
-            analysis["needs_update"] = True
-            logging.info(f"og:image tag is missing or incorrect for: {html_file.name}")
-
-        if not twitter_image_tag or twitter_image_tag.get("content") != expected_image_url:
-            analysis["twitter_image_needs_update"] = True
-            analysis["needs_update"] = True
-            logging.info(f"twitter:image tag is missing or incorrect for: {html_file.name}")
-
-        # A screenshot is required if the target image file doesn't exist on disk,
-        # or if we are about to update the tags to point to it.
-        if not expected_image_path.exists() or analysis.get("og_image_needs_update") or analysis.get("twitter_image_needs_update"):
-            analysis["needs_screenshot"] = True
-            analysis["expected_image_name"] = expected_image_name
-            if not expected_image_path.exists():
-                logging.info(f"Image file '{expected_image_name}' is missing for {html_file.name}, will generate.")
-            else: # Tags are being updated, so we'll refresh the screenshot to be safe.
-                logging.info(f"Image tags for {html_file.name} are incorrect, will generate fresh screenshot.")
-
-
-        # 2. Title Tags (Add if missing)
-        if not og_title_tag:
-            analysis["og_title_missing"] = True
-            analysis["needs_update"] = True
-        if not twitter_title_tag:
-            analysis["twitter_title_missing"] = True
-            analysis["needs_update"] = True
-
-        # 3. Description Tags (Add if missing)
-        if not og_description_tag:
-            analysis["og_description_missing"] = True
-            analysis["needs_update"] = True
-        if not twitter_description_tag:
-            analysis["twitter_description_missing"] = True
-            analysis["needs_update"] = True
-        if not meta_description_tag:
-            analysis["meta_description_missing"] = True
-            analysis["needs_update"] = True
-
-        # 4. Other Social Tags (Add if missing, or update content if present and incorrect)
-        if not og_type_tag or og_type_tag.get("content") != OG_TYPE_DEFAULT:
-            analysis["og_type_needs_update"] = True
-            analysis["needs_update"] = True
-        if not og_url_tag or og_url_tag.get("content") != expected_canonical_url:
-            analysis["og_url_needs_update"] = True
-            analysis["needs_update"] = True
-        if not twitter_card_tag or twitter_card_tag.get("content") != TWITTER_CARD_DEFAULT:
-            analysis["twitter_card_needs_update"] = True
-            analysis["needs_update"] = True
-
-        # 5. Canonical URL (Add if missing, or update href if present and incorrect)
-        current_canonical_href = canonical_link_tag.get("href", "") if canonical_link_tag else ""
-        needs_canonical_fix = False
-        if not canonical_link_tag:
-            needs_canonical_fix = True
-            logging.info(f"Missing canonical URL: {html_file.name}")
-        else:
-            parsed_current_href = urlparse(current_canonical_href)
-            is_http_on_base_domain = (parsed_current_href.scheme == "http" and
-                                      parsed_current_href.netloc.lower() == http_base_url_netloc.lower())
-            is_different_url = (current_canonical_href != expected_canonical_url)
-            if is_http_on_base_domain or is_different_url:
-                needs_canonical_fix = True
-                logging.info(
-                    f"Incorrect canonical URL: {html_file.name} (Current: '{current_canonical_href}', Expected: '{expected_canonical_url}')"
-                )
-        if needs_canonical_fix:
-            analysis["canonical_needs_fix"] = True
-            analysis["expected_canonical_url"] = expected_canonical_url
-            analysis["needs_update"] = True
-
-    except IOError as e:
-        logging.error(f"IOError parsing {html_file.name}: {e}")
-        analysis["error"] = str(e)
-    except Exception as e:
-        logging.error(f"Unexpected error parsing {html_file.name} ({type(e).__name__}): {e}")
-        analysis["error"] = str(e)
-        if "soup" in analysis: del analysis["soup"] # Don't pass potentially corrupt soup
-
-    return analysis
-
-def generate_screenshots(
-    files_for_screenshot: List[Dict[str, Any]], images_dir: Path
-):
-    """Generate screenshots for HTML files that need them."""
-    if not files_for_screenshot:
-        logging.info("No screenshots needed.")
-        return
 
-    images_dir.mkdir(exist_ok=True)
-    logging.info(f"Attempting to launch browser for {len(files_for_screenshot)} screenshots...")
+class Colors:
+    HEADER, BLUE, GREEN, YELLOW, RED, ENDC, BOLD, UNDERLINE = '\033[95m', '\033[94m', '\033[92m', '\033[93m', '\033[91m', '\033[0m', '\033[1m', '\033[4m'
 
+def get_parser():
+    """Determines the best available HTML parser."""
     try:
-        with sync_playwright() as p:
-            try:
-                browser = p.chromium.launch(headless=True)
-            except PlaywrightError as e:
-                logging.error(f"Failed to launch Chromium. Ensure Playwright browsers are installed ('playwright install chromium'). Error: {e}")
-                return
-
-            page = browser.new_page(viewport={"width": 1200, "height": 630})
-            page.set_extra_http_headers(
-                {"User-Agent": "Mozilla/5.0 (compatible; PreviewBot/1.0; +https://cheatsheets.davidveksler.com/)"}
-            )
-
-            for file_info in files_for_screenshot:
-                html_path = file_info["path"]
-                image_name = file_info.get("expected_image_name", f"{html_path.stem}.png")
-                image_path = images_dir / image_name
-                try:
-                    page.goto(f"file://{html_path.resolve()}", wait_until="networkidle")
-                    page.evaluate(
-                        """
-                        ['cookie-banner', 'cookie-notice', 'gdpr-banner', 'privacy-popup'].forEach(cls => {
-                            document.querySelectorAll(`.${cls}`).forEach(el => el.style.display = 'none');
-                        });
-                    """
-                    )
-                    page.screenshot(path=image_path, type="png")
-                    logging.info(f"Generated screenshot: {image_path.name}")
-                except PlaywrightError as e:
-                    logging.error(
-                        f"Playwright screenshot failed for {html_path.name}: {e}"
-                    )
-                except Exception as e:
-                    logging.error(
-                        f"Unexpected error during screenshot for {html_path.name} ({type(e).__name__}): {e}"
-                    )
-            browser.close()
-    except PlaywrightError as e:
-        logging.error(f"Playwright context error: {e}. Screenshots may not have been generated.")
+        BeautifulSoup("<html></html>", "lxml")
+        return "lxml"
+    except FeatureNotFound:
+        print(f"{Colors.YELLOW}Note: 'lxml' parser not found. Falling back to the built-in 'html.parser'.\nFor better performance, run: {Colors.BOLD}python3 -m pip install lxml{Colors.ENDC}")
+        return "html.parser"
+HTML_PARSER = get_parser()
+
+class ChangeProposal:
+    """Stores all proposed changes for files and images."""
+    def __init__(self):
+        self.html_additions: Dict[Path, List[Dict]] = {}
+        self.html_updates: Dict[Path, List[Dict]] = {}
+        self.screenshot_tasks: Set[str] = set()
+        self.scanned_files = 0
+        self.unparseable_files = 0
+
+    def add_tag(self, file_path: Path, attrs: Dict, content: str, tag_name: str, content_attr: str):
+        if file_path not in self.html_additions: self.html_additions[file_path] = []
+        self.html_additions[file_path].append({"attrs": attrs, "content": content, "tag_name": tag_name, "content_attr": content_attr})
+
+    def update_tag(self, file_path: Path, attrs: Dict, old_val: str, new_val: str, tag_name: str, content_attr: str):
+        if file_path not in self.html_updates: self.html_updates[file_path] = []
+        self.html_updates[file_path].append({"attrs": attrs, "old": old_val, "new": new_val, "tag_name": tag_name, "content_attr": content_attr})
+
+    def add_screenshot_task(self, image_name: str): self.screenshot_tasks.add(image_name)
+    def file_has_changes(self, file_path: Path) -> bool: return file_path in self.html_additions or file_path in self.html_updates
+    def has_changes(self) -> bool: return bool(self.html_additions or self.html_updates or self.screenshot_tasks)
+
+    def print_report(self):
+        print(f"\n{Colors.BOLD}{Colors.HEADER}--- Meta Tag Analysis Report ---{Colors.ENDC}")
+        changed_file_count = len(set(self.html_additions.keys()) | set(self.html_updates.keys()))
+        perfect_files = self.scanned_files - changed_file_count - self.unparseable_files
+        print(f"Scanned {self.scanned_files} files: {Colors.GREEN}{perfect_files} perfect{Colors.ENDC}, {Colors.YELLOW}{changed_file_count} with issues{Colors.ENDC}, {Colors.RED}{self.unparseable_files} unparseable.{Colors.ENDC}")
+
+        if not self.has_changes(): return
+
+        all_changed_files = sorted(list(set(self.html_additions.keys()) | set(self.html_updates.keys())))
+
+        for file_path in all_changed_files:
+            print(f"\n📄 {Colors.BOLD}{file_path.name}{Colors.ENDC}")
+            for change in sorted(self.html_additions.get(file_path, []), key=lambda x: str(x['attrs'])):
+                tag_html = f'<{change["tag_name"]} {list(change["attrs"].keys())[0]}="{list(change["attrs"].values())[0]}" {change["content_attr"]}="{change["content"]}">'
+                print(f"  {Colors.GREEN}[+] ADD:    {Colors.ENDC}{tag_html}")
+            for change in sorted(self.html_updates.get(file_path, []), key=lambda x: str(x['attrs'])):
+                tag_html = f'<{change["tag_name"]} {list(change["attrs"].keys())[0]}="{list(change["attrs"].values())[0]}" {change["content_attr"]}="{change["new"]}">'
+                print(f"  {Colors.YELLOW}[~] UPDATE: {Colors.ENDC}{tag_html}")
+                print(f"            (from: {change['old']})")
+
+        if self.screenshot_tasks:
+            print(f"\n{Colors.UNDERLINE}{Colors.BLUE}Required Screenshot Generations:{Colors.ENDC}")
+            for image_name in sorted(list(self.screenshot_tasks)):
+                print(f"  {Colors.GREEN}[+] GENERATE: {Colors.ENDC}images/{image_name}")
+        print("-" * 30)
+
+def analyze_file(html_file: Path, base_url: str, proposal: ChangeProposal):
+    proposal.scanned_files += 1
+    try:
+        content = html_file.read_text(encoding="utf-8")
+        if not content.strip(): return
+        soup = BeautifulSoup(content, HTML_PARSER)
     except Exception as e:
-        logging.error(f"General error in screenshot generation ({type(e).__name__}): {e}")
-
-
-def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
-    """Updates a single HTML file based on its analysis results."""
-    html_path = file_analysis["path"]
+        logging.error(f"Could not parse {html_file.name}: {e}")
+        proposal.unparseable_files += 1
+        return
 
-    soup = file_analysis.get("soup")
-    if not soup:
-        try:
-            with open(html_path, "r", encoding="utf-8", errors="replace") as f:
-                content = f.read()
-                if not content.strip():
-                    logging.warning(f"Skipping update for empty file: {html_path.name}")
-                    return
-                soup = BeautifulSoup(content, "lxml")
-        except IOError as e:
-            logging.error(f"IOError re-reading {html_path.name} for update: {e}")
-            return
-        except Exception as e:
-            logging.error(f"Unexpected error re-reading {html_path.name} for update ({type(e).__name__}): {e}")
-            return
-
-
-    head = _ensure_head_exists(soup, html_path)
+    head = soup.head
     if not head:
+        logging.warning(f"No <head> tag in {html_file.name}. Skipping.")
+        proposal.unparseable_files += 1
         return
 
-    total_changes_made_to_file = False
-    file_stem = html_path.stem
-    expected_canonical_url = file_analysis.get("expected_canonical_url", f"{base_url}{html_path.name}")
-
-    default_title_text = _extract_content(soup, "h1", default_text=file_stem.replace('-', ' ').title())
-    default_description_text = DEFAULT_DESCRIPTION_PLACEHOLDER.format(file_stem.replace('-', ' '))
-
-    if file_analysis.get("og_image_needs_update") or file_analysis.get("twitter_image_needs_update"):
-        image_name_for_meta = file_analysis.get("expected_image_name", f"{file_stem}.png")
-        image_url_for_meta = f"images/{image_name_for_meta}"
-
-        if file_analysis.get("og_image_needs_update"):
-            _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:image"}, image_url_for_meta)
-            if changed: total_changes_made_to_file = True
-
-        if file_analysis.get("twitter_image_needs_update"):
-            _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:image"}, image_url_for_meta)
-            if changed: total_changes_made_to_file = True
-
-    if file_analysis.get("og_title_missing"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:title"}, default_title_text)
-        if changed: total_changes_made_to_file = True
-    if file_analysis.get("twitter_title_missing"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:title"}, default_title_text)
-        if changed: total_changes_made_to_file = True
-
-    if file_analysis.get("og_description_missing"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:description"}, default_description_text)
-        if changed: total_changes_made_to_file = True
-    if file_analysis.get("twitter_description_missing"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:description"}, default_description_text)
-        if changed: total_changes_made_to_file = True
-    if file_analysis.get("meta_description_missing"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"name": "description"}, default_description_text)
-        if changed: total_changes_made_to_file = True
-
-    if file_analysis.get("og_type_needs_update"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:type"}, OG_TYPE_DEFAULT)
-        if changed: total_changes_made_to_file = True
-    if file_analysis.get("og_url_needs_update"):
-         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:url"}, expected_canonical_url)
-         if changed: total_changes_made_to_file = True
-    if file_analysis.get("twitter_card_needs_update"):
-        _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:card"}, TWITTER_CARD_DEFAULT)
-        if changed: total_changes_made_to_file = True
-
-    if file_analysis.get("canonical_needs_fix"):
-        _, changed = _get_or_create_link_tag(soup, head, {"rel": "canonical"}, expected_canonical_url)
-        if changed:
-            total_changes_made_to_file = True
-            logging.info(f"Updated canonical URL in {html_path.name} to: {expected_canonical_url}")
-
-    if total_changes_made_to_file:
-        try:
-            with open(html_path, "w", encoding="utf-8") as f:
-                f.write(str(soup.prettify())) # Using prettify for readable output
-            logging.info(f"Successfully updated meta tags for: {html_path.name}")
-        except IOError as e:
-            logging.error(f"IOError during final write of {html_path.name}: {e}")
-        except Exception as e:
-            logging.error(f"Unexpected error during final write of {html_path.name} ({type(e).__name__}): {e}")
-
-
-def process_directory(directory: str = ".", base_url: str = "https://cheatsheets.davidveksler.com/"):
-    dir_path = Path(directory).resolve()
-
-    if not dir_path.exists() or not dir_path.is_dir():
-        logging.error(
-            f"Error: Provided directory '{directory}' does not exist or is not a directory."
-        )
-        return
+    # --- Analysis Logic ---
+    file_stem = html_file.stem
+    images_dir = html_file.parent / "images"
 
-    try:
-        parsed_base_url = urlparse(base_url)
-    except ValueError as e:
-        logging.error(f"Error: Invalid base_url '{base_url}'. Could not parse. {e}")
-        return
+    # Determine the correct, final image URL to use for this page
+    final_image_url = f"images/{file_stem}.png"  # Default if no valid image is found
+    needs_screenshot = True
 
-    if not (parsed_base_url.scheme in ["http", "https"] and parsed_base_url.netloc):
-        logging.error(f"Error: Invalid base_url '{base_url}'. Must be a valid HTTP/HTTPS URL.")
-        return
-    if not base_url.endswith("/"):
-        base_url += "/"
-        logging.warning(f"Base URL did not end with '/', appended it: {base_url}")
-        parsed_base_url = urlparse(base_url)
+    # Check for an existing, valid og:image tag
+    og_image_tag = soup.find("meta", property="og:image")
+    if og_image_tag and og_image_tag.get("content"):
+        current_og_image_url = og_image_tag.get("content")
 
-    http_base_url_netloc = parsed_base_url.netloc
+        if "YOUR_IMAGE_URL_HERE" not in current_og_image_url:
+            try:
+                # Extract filename from a potential full URL or relative path
+                image_filename = Path(urlparse(current_og_image_url).path).name
+                if (images_dir / image_filename).exists():
+                    # A valid, existing image was found! Use it.
+                    final_image_url = f"images/{image_filename}"
+                    needs_screenshot = False # Don't generate a new one
+            except Exception as e:
+                logging.warning(f"Could not parse image path '{current_og_image_url}' in {html_file.name}: {e}")
 
-    images_dir = dir_path / "images"
-    logging.info(f"Processing directory: {dir_path}")
-    logging.info(f"Using base URL: {base_url}")
+    expected_canonical_url = f"{base_url}{html_file.name}"
 
-    all_html_files = sorted(list(dir_path.glob("*.html"))) # Sorted for deterministic output
-    if not all_html_files:
-        logging.info(f"No HTML files found in {dir_path}.")
+    # ADD-ONLY tags (Creative Content - will not overwrite)
+    if not soup.find("meta", property="og:title"):
+        title = soup.find("title")
+        if title and title.get_text(strip=True):
+            proposal.add_tag(html_file, {"property": "og:title"}, title.get_text(strip=True), "meta", "content")
+
+    # ADD or UPDATE tags (Technical Content)
+    technical_tags = {
+        "og:image": ({"property": "og:image"}, final_image_url, "meta", "content"),
+        "twitter:image": ({"name": "twitter:image"}, final_image_url, "meta", "content"),
+        "og:url": ({"property": "og:url"}, expected_canonical_url, "meta", "content"),
+        "canonical": ({"rel": "canonical"}, expected_canonical_url, "link", "href"),
+        "og:type": ({"property": "og:type"}, OG_TYPE_DEFAULT, "meta", "content"),
+        "twitter:card": ({"name": "twitter:card"}, TWITTER_CARD_DEFAULT, "meta", "content"),
+    }
+    for _, (attrs, expected_content, tag_name, content_attr) in technical_tags.items():
+        tag = soup.find(tag_name, attrs=attrs)
+        if not tag:
+            proposal.add_tag(html_file, attrs, expected_content, tag_name, content_attr)
+        elif tag.get(content_attr) != expected_content:
+            proposal.update_tag(html_file, attrs, tag.get(content_attr, "N/A"), expected_content, tag_name, content_attr)
+
+    # Add a screenshot task ONLY if we couldn't find a valid existing image
+    # AND the standardized target image doesn't exist either.
+    expected_image_path = images_dir / f"{file_stem}.png"
+    if needs_screenshot and not expected_image_path.exists():
+        proposal.add_screenshot_task(f"{file_stem}.png")
+
+def apply_changes(proposal: ChangeProposal, images_dir: Path):
+    if proposal.html_additions or proposal.html_updates:
+        print(f"\n{Colors.BLUE}Applying HTML changes...{Colors.ENDC}")
+        all_changed_files = sorted(list(set(proposal.html_additions.keys()) | set(proposal.html_updates.keys())))
+        for file_path in all_changed_files:
+            try:
+                soup = BeautifulSoup(file_path.read_text(encoding="utf-8"), HTML_PARSER)
+                head = soup.head
+                if not head: continue
+
+                updates_for_file = proposal.html_updates.get(file_path, [])
+                for update in updates_for_file:
+                    old_tag = soup.find(update['tag_name'], attrs=update['attrs'])
+                    if old_tag: old_tag.decompose()
+
+                all_tags_to_add = proposal.html_additions.get(file_path, []) + updates_for_file
+                for change in all_tags_to_add:
+                    new_tag = soup.new_tag(change['tag_name'], attrs=change['attrs'])
+                    new_tag[change['content_attr']] = change.get('new', change.get('content'))
+                    head.append(new_tag)
+
+                file_path.write_text(str(soup.prettify()), encoding="utf-8")
+                print(f"  {Colors.GREEN}Updated:{Colors.ENDC} {file_path.name}")
+            except Exception as e:
+                print(f"{Colors.RED}Error updating {file_path.name}: {e}{Colors.ENDC}")
+
+    if proposal.screenshot_tasks:
+        generate_screenshots(list(proposal.screenshot_tasks), images_dir)
+
+def generate_screenshots(image_names: List[str], images_dir: Path):
+    if not PLAYWRIGHT_AVAILABLE:
+        print(f"{Colors.RED}\nPlaywright is not installed. Cannot generate screenshots.{Colors.ENDC}")
+        print(f"{Colors.YELLOW}To enable, run: {Colors.BOLD}python3 -m pip install playwright && python3 -m playwright install{Colors.ENDC}")
         return
 
-    processed_files_info: List[Dict[str, Any]] = []
-    for html_file in all_html_files:
-        logging.debug(f"Analyzing {html_file.name}...")
-        analysis = analyze_html_file(html_file, base_url, http_base_url_netloc)
-        if analysis.get("error"):
-            logging.warning(f"Skipping {html_file.name} due to analysis error: {analysis['error']}")
-            if "soup" in analysis: del analysis["soup"]
-            continue
-
-        if analysis.get("needs_update") or analysis.get("needs_screenshot"):
-            processed_files_info.append(analysis)
-
-    if not processed_files_info:
-        logging.info("No files need processing after analysis.")
+    images_dir.mkdir(exist_ok=True)
+    print(f"\n{Colors.BLUE}Generating {len(image_names)} screenshots...{Colors.ENDC}")
+    with sync_playwright() as p:
+        try:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page(viewport={"width": 1200, "height": 630})
+            for image_name in sorted(image_names):
+                html_path = (images_dir.parent / f"{Path(image_name).stem}.html").resolve()
+                if not html_path.exists():
+                    print(f"  {Colors.YELLOW}Warning:{Colors.ENDC} Could not find source file {html_path.name} to generate screenshot.")
+                    continue
+                try:
+                    page.goto(f"file://{html_path}", wait_until="networkidle")
+                    page.screenshot(path=images_dir / image_name, type="png")
+                    print(f"  {Colors.GREEN}Generated:{Colors.ENDC} {Path('images') / image_name}")
+                except PlaywrightError as e:
+                    print(f"  {Colors.RED}Error: {Colors.ENDC}Could not screenshot {html_path.name}: {e}")
+            browser.close()
+        except PlaywrightError as e:
+            print(f"{Colors.RED}Playwright Error: {e}. Ensure browsers are installed with: {Colors.BOLD}python3 -m playwright install{Colors.ENDC}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyzes and fixes meta tags in HTML files.", formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("directory", nargs="?", default=".", help="Directory to scan (default: current).")
+    parser.add_argument("--base-url", default="https://cheatsheets.davidveksler.com/", help="Base URL for canonical links.")
+    parser.add_argument("--apply", action="store_true", help="Apply proposed changes to files. Default is a dry-run report.")
+    args = parser.parse_args()
+
+    dir_path = Path(args.directory)
+    if not dir_path.is_dir():
+        print(f"{Colors.RED}Error: Directory '{args.directory}' not found.{Colors.ENDC}")
         return
 
-    files_needing_screenshots = [
-        res for res in processed_files_info if res.get("needs_screenshot")
-    ]
-    files_needing_html_updates = [
-        res for res in processed_files_info if res.get("needs_update")
-    ]
-
-    logging.info(f"Found {len(files_needing_screenshots)} files potentially needing screenshots.")
-    logging.info(f"Found {len(files_needing_html_updates)} files needing HTML meta tag updates.")
-
-    if files_needing_screenshots:
-        generate_screenshots(files_needing_screenshots, images_dir)
-
-    if files_needing_html_updates:
-        for file_analysis in files_needing_html_updates:
-            update_html_file_meta(file_analysis, base_url)
-
-    logging.info("Processing complete.")
+    proposal = ChangeProposal()
+    html_files = sorted(list(dir_path.glob("*.html")))
+    if not html_files:
+        print(f"{Colors.YELLOW}No HTML files found in '{dir_path.resolve()}' to analyze.{Colors.ENDC}")
+        return
 
-# Running the main function on our test directory
-process_directory(directory=".", base_url="https://test.com/")
+    for html_file in html_files:
+        analyze_file(html_file, args.base_url, proposal)
 
+    proposal.print_report()
+    if not proposal.has_changes():
+        return
 
-# Finally, print the contents of the modified files to verify the changes
-print("\n--- Verification ---")
-for f in sorted(Path(".").glob("*.html")):
-    print(f"\n--- Contents of {f.name} ---")
-    print(f.read_text())
+    if args.apply:
+        apply_changes(proposal, dir_path / "images")
+        print(f"\n{Colors.GREEN}Processing complete.{Colors.ENDC}")
+    else:
+        print(f"\n{Colors.YELLOW}This was a dry run. To apply these changes, run again with the {Colors.BOLD}--apply{Colors.YELLOW} flag.{Colors.ENDC}")
 
-# Check for generated images
-print("\n--- Generated Images ---")
-image_files = list(Path("./images").glob("*.png"))
-for img in image_files:
-    print(img.name)
-if not image_files:
-    print("No images found.")
+if __name__ == "__main__":
+    main()
\ No newline at end of file