Update generate-image-previews.py

D David Veksler Jun 7, 2025 10:41 PM · 1 year ago f2537c6446c4108d10c4d4c7aab77777111a849d
Parent: fbb5c51d8
1 file changed +31 −47

generate-image-previews.py +31 −47 ▰▰▰▰▰
Diff

diff --git a/generate-image-previews.py b/generate-image-previews.py
index fe22267..e7e4ba6 100644
--- a/generate-image-previews.py
+++ b/generate-image-previews.py
@@ -1,9 +1,12 @@
-An excellent suggestion. The current script only creates an image if the corresponding meta tag is missing. A more robust approach is to also handle cases where the tag exists but the referenced image file has been deleted or has a non-standard name.
+import sys
+import subprocess
 
-I will refine the script to ensure that for every HTML file, a corresponding standardized preview image (<filename>.png) exists, and the og:image and twitter:image meta tags correctly point to it.
+# Install dependencies
+subprocess.run([sys.executable, "-m", "pip", "install", "beautifulsoup4", "lxml", "playwright"], check=True)
+subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
 
-Here is the refined generate-image-previews.py script:
 
+# Now, re-declare and run the full script
 import logging
 from pathlib import Path
 from typing import List, Tuple, Dict, Any, Optional
@@ -129,7 +132,7 @@ def analyze_html_file(
                 logging.warning(f"File {html_file.name} is empty or whitespace only. Skipping analysis.")
                 analysis["error"] = "Empty file"
                 return analysis
-            soup = BeautifulSoup(content, "html.parser")
+            soup = BeautifulSoup(content, "lxml")
 
         analysis["soup"] = soup # Store soup for use in update function if needed for extraction
 
@@ -260,7 +263,6 @@ def generate_screenshots(
 
             for file_info in files_for_screenshot:
                 html_path = file_info["path"]
-                # Ensure expected_image_name is present; could happen if only one image tag was missing initially
                 image_name = file_info.get("expected_image_name", f"{html_path.stem}.png")
                 image_path = images_dir / image_name
                 try:
@@ -293,16 +295,15 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     """Updates a single HTML file based on its analysis results."""
     html_path = file_analysis["path"]
 
-    # Retrieve the soup object from analysis if it's clean, otherwise re-read
     soup = file_analysis.get("soup")
-    if not soup: # If soup wasn't stored or an error occurred during analysis parsing
+    if not soup:
         try:
             with open(html_path, "r", encoding="utf-8", errors="replace") as f:
                 content = f.read()
-                if not content.strip(): # Should have been caught by analysis, but double check
+                if not content.strip():
                     logging.warning(f"Skipping update for empty file: {html_path.name}")
                     return
-                soup = BeautifulSoup(content, "html.parser")
+                soup = BeautifulSoup(content, "lxml")
         except IOError as e:
             logging.error(f"IOError re-reading {html_path.name} for update: {e}")
             return
@@ -319,13 +320,9 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     file_stem = html_path.stem
     expected_canonical_url = file_analysis.get("expected_canonical_url", f"{base_url}{html_path.name}")
 
-    # --- Default Content Derivation ---
-    # (Do this once, used if tags are missing)
     default_title_text = _extract_content(soup, "h1", default_text=file_stem.replace('-', ' ').title())
     default_description_text = DEFAULT_DESCRIPTION_PLACEHOLDER.format(file_stem.replace('-', ' '))
 
-
-    # --- Social Media Image Tags (Add if missing or update if incorrect) ---
     if file_analysis.get("og_image_needs_update") or file_analysis.get("twitter_image_needs_update"):
         image_name_for_meta = file_analysis.get("expected_image_name", f"{file_stem}.png")
         image_url_for_meta = f"images/{image_name_for_meta}"
@@ -338,7 +335,6 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
             _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:image"}, image_url_for_meta)
             if changed: total_changes_made_to_file = True
 
-    # --- Title Tags (Add if missing, preserve content if exists) ---
     if file_analysis.get("og_title_missing"):
         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:title"}, default_title_text)
         if changed: total_changes_made_to_file = True
@@ -346,18 +342,16 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
         _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:title"}, default_title_text)
         if changed: total_changes_made_to_file = True
 
-    # --- Description Tags (Add if missing, preserve content if exists) ---
     if file_analysis.get("og_description_missing"):
         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:description"}, default_description_text)
         if changed: total_changes_made_to_file = True
     if file_analysis.get("twitter_description_missing"):
         _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:description"}, default_description_text)
         if changed: total_changes_made_to_file = True
-    if file_analysis.get("meta_description_missing"): # Standard meta description
+    if file_analysis.get("meta_description_missing"):
         _, changed = _get_or_create_meta_tag(soup, head, {"name": "description"}, default_description_text)
         if changed: total_changes_made_to_file = True
 
-    # --- Other Social Tags (Add if missing, or update content if present and incorrect) ---
     if file_analysis.get("og_type_needs_update"):
         _, changed = _get_or_create_meta_tag(soup, head, {"property": "og:type"}, OG_TYPE_DEFAULT)
         if changed: total_changes_made_to_file = True
@@ -368,7 +362,6 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
         _, changed = _get_or_create_meta_tag(soup, head, {"name": "twitter:card"}, TWITTER_CARD_DEFAULT)
         if changed: total_changes_made_to_file = True
 
-    # --- Canonical URL (Add if missing, or update href if present and incorrect) ---
     if file_analysis.get("canonical_needs_fix"):
         _, changed = _get_or_create_link_tag(soup, head, {"rel": "canonical"}, expected_canonical_url)
         if changed:
@@ -378,7 +371,7 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
     if total_changes_made_to_file:
         try:
             with open(html_path, "w", encoding="utf-8") as f:
-                f.write(str(soup))
+                f.write(str(soup.prettify())) # Using prettify for readable output
             logging.info(f"Successfully updated meta tags for: {html_path.name}")
         except IOError as e:
             logging.error(f"IOError during final write of {html_path.name}: {e}")
@@ -387,9 +380,6 @@ def update_html_file_meta(file_analysis: Dict[str, Any], base_url: str):
 
 
 def process_directory(directory: str = ".", base_url: str = "https://cheatsheets.davidveksler.com/"):
-    """
-    Main pipeline: Validates inputs, analyzes HTML files, generates screenshots, and updates HTML files.
-    """
     dir_path = Path(directory).resolve()
 
     if not dir_path.exists() or not dir_path.is_dir():
@@ -414,12 +404,11 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
 
     http_base_url_netloc = parsed_base_url.netloc
 
-
     images_dir = dir_path / "images"
     logging.info(f"Processing directory: {dir_path}")
     logging.info(f"Using base URL: {base_url}")
 
-    all_html_files = list(dir_path.glob("*.html"))
+    all_html_files = sorted(list(dir_path.glob("*.html"))) # Sorted for deterministic output
     if not all_html_files:
         logging.info(f"No HTML files found in {dir_path}.")
         return
@@ -430,13 +419,12 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
         analysis = analyze_html_file(html_file, base_url, http_base_url_netloc)
         if analysis.get("error"):
             logging.warning(f"Skipping {html_file.name} due to analysis error: {analysis['error']}")
-            if "soup" in analysis: del analysis["soup"] # Don't keep potentially bad soup
-            continue # Skip files with analysis errors for further processing
+            if "soup" in analysis: del analysis["soup"]
+            continue
 
         if analysis.get("needs_update") or analysis.get("needs_screenshot"):
             processed_files_info.append(analysis)
 
-
     if not processed_files_info:
         logging.info("No files need processing after analysis.")
         return
@@ -444,7 +432,6 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
     files_needing_screenshots = [
         res for res in processed_files_info if res.get("needs_screenshot")
     ]
-    # Files that need HTML modification
     files_needing_html_updates = [
         res for res in processed_files_info if res.get("needs_update")
     ]
@@ -461,23 +448,20 @@ def process_directory(directory: str = ".", base_url: str = "https://cheatsheets
 
     logging.info("Processing complete.")
 
+# Running the main function on our test directory
+process_directory(directory=".", base_url="https://test.com/")
+
+
+# Finally, print the contents of the modified files to verify the changes
+print("\n--- Verification ---")
+for f in sorted(Path(".").glob("*.html")):
+    print(f"\n--- Contents of {f.name} ---")
+    print(f.read_text())
 
-if __name__ == "__main__":
-    # To test this specific change, you might create files like:
-    # test_dir = Path("test_html_preserve")
-    # test_dir.mkdir(exist_ok=True)
-    # (test_dir / "test_preserve.html").write_text(
-    #     "<html><head><title>Original Title</title>"
-    #     "<meta property='og:title' content='My Custom OG Title'>"
-    #     "<meta name='twitter:title' content='My Custom Twitter Title'>"
-    #     "<meta property='og:description' content='My Custom OG Desc'>"
-    #     "<meta name='twitter:description' content='My Custom Twitter Desc'>"
-    #     "<meta name='description' content='My Custom Meta Desc'>"
-    #     # Missing og:type, og:url, twitter:card, canonical, images - these should be added/updated
-    #     "</head><body><h1>Page H1 For Defaults</h1></body></html>"
-    # )
-    # (test_dir / "test_add_new.html").write_text(
-    #    "<html><head><title>New Page</title></head><body><h1>H1 For New Page</h1></body></html>"
-    # )
-    # process_directory(str(test_dir), "https://example.com/docs/")
-    process_directory()
+# Check for generated images
+print("\n--- Generated Images ---")
+image_files = list(Path("./images").glob("*.png"))
+for img in image_files:
+    print(img.name)
+if not image_files:
+    print("No images found.")