Update generate-image-previews.py

D David Veksler · 1 year ago fbb5c51d85d9cec6681a0fc63af9840d136a6fac
Parent: 0007a0032

1 file changed +83 −58

Diff

diff --git a/generate-image-previews.py b/generate-image-previews.py
index ed2a99d..fe22267 100644
--- a/generate-image-previews.py
+++ b/generate-image-previews.py
@@ -1,3 +1,9 @@
+# generate-image-previews.py: ensure that every HTML file has a standardized preview image
+# (<filename>.png) and that its og:image and twitter:image meta tags point to that image.
+#
+# Previously an image was created only when the corresponding meta tag was missing. The script now
+# also handles tags that exist but reference a deleted image file or one with a non-standard name.
+
 import logging
 from pathlib import Path
 from typing import List, Tuple, Dict, Any, Optional
@@ -78,8 +84,8 @@ def _get_or_create_link_tag(
             if hasattr(child, 'name') and child.name == 'meta':
                 if child.get('charset') or child.get('name') == 'viewport':
                     insert_position = i + 1
-        
-        if not head.contents: 
+
+        if not head.contents:
             head.append(tag)
         elif insert_position < len(head.contents):
             head.insert(insert_position, tag)
@@ -113,6 +119,7 @@ def analyze_html_file(
     analysis: Dict[str, Any] = {"path": html_file, "needs_update": False}
     file_stem = html_file.stem
     expected_image_name = f"{file_stem}.png"
+    expected_image_url = f"images/{expected_image_name}"
     expected_canonical_url = f"{base_url}{html_file.name}"
 
     try:
@@ -127,68 +134,84 @@ def analyze_html_file(
         analysis["soup"] = soup # Store soup for use in update function if needed for extraction
 
         # --- Find existing tags ---
-        analysis["og_image_tag"] = soup.find("meta", property="og:image")
-        analysis["twitter_image_tag"] = soup.find("meta", attrs={"name": "twitter:image"})
-        analysis["og_title_tag"] = soup.find("meta", property="og:title")
-        analysis["twitter_title_tag"] = soup.find("meta", attrs={"name": "twitter:title"})
-        analysis["og_description_tag"] = soup.find("meta", property="og:description")
-        analysis["twitter_description_tag"] = soup.find("meta", attrs={"name": "twitter:description"})
-        analysis["meta_description_tag"] = soup.find("meta", attrs={"name": "description"})
-        analysis["og_type_tag"] = soup.find("meta", property="og:type")
-        analysis["og_url_tag"] = soup.find("meta", property="og:url")
-        analysis["twitter_card_tag"] = soup.find("meta", attrs={"name": "twitter:card"})
-        analysis["canonical_link_tag"] = soup.find("link", rel="canonical")
+        og_image_tag = soup.find("meta", property="og:image")
+        twitter_image_tag = soup.find("meta", attrs={"name": "twitter:image"})
+        og_title_tag = soup.find("meta", property="og:title")
+        twitter_title_tag = soup.find("meta", attrs={"name": "twitter:title"})
+        og_description_tag = soup.find("meta", property="og:description")
+        twitter_description_tag = soup.find("meta", attrs={"name": "twitter:description"})
+        meta_description_tag = soup.find("meta", attrs={"name": "description"})
+        og_type_tag = soup.find("meta", property="og:type")
+        og_url_tag = soup.find("meta", property="og:url")
+        twitter_card_tag = soup.find("meta", attrs={"name": "twitter:card"})
+        canonical_link_tag = soup.find("link", rel="canonical")
 
         # --- Determine if updates are needed ---
 
-        # 1. Social Media Images (Add if missing)
-        if not analysis["og_image_tag"] or not analysis["twitter_image_tag"]:
-            analysis["expected_image_name"] = expected_image_name
-            analysis["needs_screenshot"] = True # Screenshot needed if either is missing
+        # 1. Social Media Image Tags & File
+        dir_path = html_file.parent
+        expected_image_path = dir_path / expected_image_url
+
+        if not og_image_tag or og_image_tag.get("content") != expected_image_url:
+            analysis["og_image_needs_update"] = True
+            analysis["needs_update"] = True
+            logging.info(f"og:image tag is missing or incorrect for: {html_file.name}")
+
+        if not twitter_image_tag or twitter_image_tag.get("content") != expected_image_url:
+            analysis["twitter_image_needs_update"] = True
             analysis["needs_update"] = True
-            if not analysis["og_image_tag"]: analysis["og_image_missing"] = True
-            if not analysis["twitter_image_tag"]: analysis["twitter_image_missing"] = True
-            logging.info(f"Identified missing social image tags for: {html_file.name}")
+            logging.info(f"twitter:image tag is missing or incorrect for: {html_file.name}")
+
+        # A screenshot is required if the target image file doesn't exist on disk,
+        # or if we are about to update the tags to point to it.
+        if not expected_image_path.exists() or analysis.get("og_image_needs_update") or analysis.get("twitter_image_needs_update"):
+            analysis["needs_screenshot"] = True
+            analysis["expected_image_name"] = expected_image_name
+            if not expected_image_path.exists():
+                logging.info(f"Image file '{expected_image_name}' is missing for {html_file.name}, will generate.")
+            else: # Tags are being updated, so we'll refresh the screenshot to be safe.
+                logging.info(f"Image tags for {html_file.name} are incorrect, will generate fresh screenshot.")
+
 
         # 2. Title Tags (Add if missing)
-        if not analysis["og_title_tag"]:
+        if not og_title_tag:
             analysis["og_title_missing"] = True
             analysis["needs_update"] = True
-        if not analysis["twitter_title_tag"]:
+        if not twitter_title_tag:
             analysis["twitter_title_missing"] = True
             analysis["needs_update"] = True
-        
+
         # 3. Description Tags (Add if missing)
-        if not analysis["og_description_tag"]:
+        if not og_description_tag:
             analysis["og_description_missing"] = True
             analysis["needs_update"] = True
-        if not analysis["twitter_description_tag"]:
+        if not twitter_description_tag:
             analysis["twitter_description_missing"] = True
             analysis["needs_update"] = True
-        if not analysis["meta_description_tag"]:
+        if not meta_description_tag:
             analysis["meta_description_missing"] = True
             analysis["needs_update"] = True
 
         # 4. Other Social Tags (Add if missing, or update content if present and incorrect)
-        if not analysis["og_type_tag"] or analysis["og_type_tag"].get("content") != OG_TYPE_DEFAULT:
+        if not og_type_tag or og_type_tag.get("content") != OG_TYPE_DEFAULT:
             analysis["og_type_needs_update"] = True
             analysis["needs_update"] = True
-        if not analysis["og_url_tag"] or analysis["og_url_tag"].get("content") != expected_canonical_url:
+        if not og_url_tag or og_url_tag.get("content") != expected_canonical_url:
             analysis["og_url_needs_update"] = True
             analysis["needs_update"] = True
-        if not analysis["twitter_card_tag"] or analysis["twitter_card_tag"].get("content") != TWITTER_CARD_DEFAULT:
+        if not twitter_card_tag or twitter_card_tag.get("content") != TWITTER_CARD_DEFAULT:
             analysis["twitter_card_needs_update"] = True
             analysis["needs_update"] = True
-            
+
         # 5. Canonical URL (Add if missing, or update href if present and incorrect)
-        current_canonical_href = analysis["canonical_link_tag"].get("href", "") if analysis["canonical_link_tag"] else ""
+        current_canonical_href = canonical_link_tag.get("href", "") if canonical_link_tag else ""
         needs_canonical_fix = False
-        if not analysis["canonical_link_tag"]:
+        if not canonical_link_tag:
             needs_canonical_fix = True
             logging.info(f"Missing canonical URL: {html_file.name}")
         else:
             parsed_current_href = urlparse(current_canonical_href)
-            is_http_on_base_domain = (parsed_current_href.scheme == "http" and 
+            is_http_on_base_domain = (parsed_current_href.scheme == "http" and
                                       parsed_current_href.netloc.lower() == http_base_url_netloc.lower())
             is_different_url = (current_canonical_href != expected_canonical_url)
             if is_http_on_base_domain or is_different_url:
@@ -204,11 +227,11 @@ def analyze_html_file(
     except IOError as e:
         logging.error(f"IOError parsing {html_file.name}: {e}")
         analysis["error"] = str(e)
-    except Exception as e: 
+    except Exception as e:
         logging.error(f"Unexpected error parsing {html_file.name} ({type(e).__name__}): {e}")
         analysis["error"] = str(e)
         if "soup" in analysis: del analysis["soup"] # Don't pass potentially corrupt soup
-        
+
     return analysis
 
 def generate_screenshots(
@@ -228,7 +251,7 @@ def generate_screenshots(
                 browser = p.chromium.launch(headless=True)
             except PlaywrightError as e:
                 logging.error(f"Failed to launch Chromium. Ensure Playwright browsers are installed ('playwright install chromium'). Error: {e}")
-                return 
+                return
 
             page = browser.new_page(viewport={"width": 1200, "height": 630})
             page.set_extra_http_headers(
@@ -238,7 +261,7 @@ def generate_screenshots(
             for file_info in files_for_screenshot:
                 html_path = file_info["path"]
                 # Ensure expected_image_name is present; could happen if only one image tag was missing initially
-                image_name = file_info.get("expected_image_name", f"{html_path.stem}.png") 
+                image_name = file_info.get("expected_image_name", f"{html_path.stem}.png")
                 image_path = images_dir / image_name
                 try:
                     page.goto(f"file://{html_path.resolve()}", wait_until="networkidle")
@@ -255,12 +278,12 @@ def generate_screenshots(
                     logging.error(
                         f"Playwright screenshot failed for {html_path.name}: {e}"
                     )
-                except Exception as e: 
+                except Exception as e:
                     logging.error(
                         f"Unexpected error during screenshot for {html_path.name} ({type(e).__name__}): {e}"
                     )
             browser.close()
-    except PlaywrightError as e: 
+    except PlaywrightError as e:
         logging.error(f"Playwright context error: {e}. Screenshots may not have been generated.")
     except Exception as e:
         logging.error(f"General error in screenshot generation ({type(e).__name__}): {e}")
@@ -269,7 +292,7 @@ def generate_screenshots(
 def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     """Updates a single HTML file based on its analysis results."""
     html_path = file_analysis["path"]
-    
+
     # Retrieve the soup object from analysis if it's clean, otherwise re-read
     soup = file_analysis.get("soup")
     if not soup: # If soup wasn't stored or an error occurred during analysis parsing
@@ -289,7 +312,7 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
 
 
     head = _ensure_head_exists(soup, html_path)
-    if not head: 
+    if not head:
         return
 
     total_changes_made_to_file = False
@@ -302,17 +325,19 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     default_description_text = DEFAULT_DESCRIPTION_PLACEHOLDER.format(file_stem.replace('-', ' '))
 
 
-    # --- Social Media Image Tags (Add if missing) ---
-    if file_analysis.get("og_image_missing") or file_analysis.get("twitter_image_missing"):
+    # --- Social Media Image Tags (Add if missing or update if incorrect) ---
+    if file_analysis.get("og_image_needs_update") or file_analysis.get("twitter_image_needs_update"):
         image_name_for_meta = file_analysis.get("expected_image_name", f"{file_stem}.png")
         image_url_for_meta = f"images/{image_name_for_meta}"
-        if file_analysis.get("og_image_missing"):
+
+        if file_analysis.get("og_image_needs_update"):
             _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:image"}, image_url_for_meta)
             if changed: total_changes_made_to_file = True
-        if file_analysis.get("twitter_image_missing"):
+
+        if file_analysis.get("twitter_image_needs_update"):
             _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:image"}, image_url_for_meta)
             if changed: total_changes_made_to_file = True
-    
+
     # --- Title Tags (Add if missing, preserve content if exists) ---
     if file_analysis.get("og_title_missing"):
         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:title"}, default_title_text)
@@ -331,7 +356,7 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     if file_analysis.get("meta_description_missing"): # Standard meta description
         _, changed = _get_or_create_meta_tag(soup, head, {"name": "description"}, default_description_text)
         if changed: total_changes_made_to_file = True
-        
+
     # --- Other Social Tags (Add if missing, or update content if present and incorrect) ---
     if file_analysis.get("og_type_needs_update"):
         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:type"}, OG_TYPE_DEFAULT)
@@ -346,14 +371,14 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     # --- Canonical URL (Add if missing, or update href if present and incorrect) ---
     if file_analysis.get("canonical_needs_fix"):
         _, changed = _get_or_create_link_tag(soup, head, {"rel": "canonical"}, expected_canonical_url)
-        if changed: 
+        if changed:
             total_changes_made_to_file = True
             logging.info(f"Updated canonical URL in {html_path.name} to: {expected_canonical_url}")
-        
+
     if total_changes_made_to_file:
         try:
             with open(html_path, "w", encoding="utf-8") as f:
-                f.write(str(soup)) 
+                f.write(str(soup))
             logging.info(f"Successfully updated meta tags for: {html_path.name}")
         except IOError as e:
             logging.error(f"IOError during final write of {html_path.name}: {e}")
@@ -372,7 +397,7 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
             f"Error: Provided directory '{directory}' does not exist or is not a directory."
         )
         return
-    
+
     try:
         parsed_base_url = urlparse(base_url)
     except ValueError as e:
@@ -385,7 +410,7 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
     if not base_url.endswith("/"):
         base_url += "/"
         logging.warning(f"Base URL did not end with '/', appended it: {base_url}")
-        parsed_base_url = urlparse(base_url) 
+        parsed_base_url = urlparse(base_url)
 
     http_base_url_netloc = parsed_base_url.netloc
 
@@ -407,7 +432,7 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
             logging.warning(f"Skipping {html_file.name} due to analysis error: {analysis['error']}")
             if "soup" in analysis: del analysis["soup"] # Don't keep potentially bad soup
             continue # Skip files with analysis errors for further processing
-        
+
         if analysis.get("needs_update") or analysis.get("needs_screenshot"):
             processed_files_info.append(analysis)
 
@@ -422,8 +447,8 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
     # Files that need HTML modification
     files_needing_html_updates = [
         res for res in processed_files_info if res.get("needs_update")
-    ] 
-    
+    ]
+
     logging.info(f"Found {len(files_needing_screenshots)} files potentially needing screenshots.")
     logging.info(f"Found {len(files_needing_html_updates)} files needing HTML meta tag updates.")
 
@@ -432,8 +457,8 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
 
     if files_needing_html_updates:
         for file_analysis in files_needing_html_updates:
-            update_html_file_meta(file_analysis, base_url) 
-    
+            update_html_file_meta(file_analysis, base_url)
+
     logging.info("Processing complete.")
 
 
@@ -455,4 +480,4 @@ if __name__ == "__main__":
     #    "<html><head><title>New Page</title></head><body><h1>H1 For New Page</h1></body></html>"
     # )
     # process_directory(str(test_dir), "https://example.com/docs/")
-    process_directory()
\ No newline at end of file
+    process_directory()