SET CANONICAL URLS

David Veksler · 1 year ago · 525d87527efa6dd57928d55b3a1560714c24e5de
Parent: bcf98770f

1 file changed +104 −15

Diff

diff --git a/generate-image-previews.py b/generate-image-previews.py
index 36b83d8..6e2a6a4 100644
--- a/generate-image-previews.py
+++ b/generate-image-previews.py
@@ -6,15 +6,17 @@ from typing import List, Tuple
 
 logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
 
-def analyze_html_files(directory: Path) -> List[Tuple[Path, str]]:
-    """Return (html_path, image_filename) for files missing og/twitter images."""
+def analyze_html_files(directory: Path, base_url: str = "https://cheatsheets.davidveksler.com/") -> Tuple[List[Tuple[Path, str]], List[Path]]:
+    """Return files missing og/twitter images and files needing canonical URL fixes."""
     missing_images = []
+    needs_canonical_fix = []
     
     for html_file in directory.glob('*.html'):
         try:
             with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
                 soup = BeautifulSoup(f.read(), 'html.parser')
             
+            # Check for missing social media images
             has_og = soup.find('meta', property='og:image') is not None
             has_twitter = soup.find('meta', attrs={'name': 'twitter:image'}) is not None
             
@@ -22,14 +24,36 @@ def analyze_html_files(directory: Path) -> List[Tuple[Path, str]]:
                 image_name = f"{html_file.stem}.png"
                 missing_images.append((html_file, image_name))
                 logging.info(f"Missing images: {html_file.name}")
+            
+            # Check canonical URL
+            canonical_link = soup.find('link', rel='canonical')
+            expected_canonical = f"{base_url}{html_file.name}"
+            needs_fix = False
+            
+            if not canonical_link:
+                needs_fix = True
+                logging.info(f"Missing canonical URL: {html_file.name}")
+            else:
+                current_href = canonical_link.get('href', '')
+                # Check if needs http->https fix or incorrect URL
+                if (current_href.startswith('http://cheatsheets.davidveksler.com/') or 
+                    current_href != expected_canonical):
+                    needs_fix = True
+                    logging.info(f"Incorrect canonical URL: {html_file.name} ({current_href})")
+            
+            if needs_fix:
+                needs_canonical_fix.append(html_file)
         
         except Exception as e:
             logging.error(f"Error parsing {html_file.name}: {e}")
     
-    return missing_images
+    return missing_images, needs_canonical_fix
 
 def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: Path):
     """Generate screenshots for HTML files."""
+    if not files_to_process:
+        return
+        
     images_dir.mkdir(exist_ok=True)
     
     with sync_playwright() as p:
@@ -49,7 +73,7 @@ def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: P
                 """)
                 
                 image_path = images_dir / image_name
-                page.screenshot(path=image_path, type='png')  # Removed quality parameter
+                page.screenshot(path=image_path, type='png')
                 logging.info(f"Generated: {image_name}")
                 
             except Exception as e:
@@ -57,9 +81,13 @@ def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: P
         
         browser.close()
 
-def update_html_files(files_to_process: List[Tuple[Path, str]]):
-    """Add og:image and twitter:image meta tags to HTML files."""
-    for html_path, image_name in files_to_process:
+def update_html_files(files_with_missing_images: List[Tuple[Path, str]], 
+                     files_needing_canonical_fix: List[Path], 
+                     base_url: str = "https://cheatsheets.davidveksler.com/"):
+    """Add og:image, twitter:image meta tags and fix canonical URLs."""
+    
+    # Process files missing social media images
+    for html_path, image_name in files_with_missing_images:
         try:
             with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
                 soup = BeautifulSoup(f.read(), 'html.parser')
@@ -88,28 +116,89 @@ def update_html_files(files_to_process: List[Tuple[Path, str]]):
             with open(html_path, 'w', encoding='utf-8') as f:
                 f.write(str(soup))
             
-            logging.info(f"Updated: {html_path.name}")
+            logging.info(f"Updated social media tags: {html_path.name}")
+            
+        except Exception as e:
+            logging.error(f"Social media update failed for {html_path.name}: {e}")
+    
+    # Process files needing canonical URL fixes
+    for html_path in files_needing_canonical_fix:
+        try:
+            with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
+                soup = BeautifulSoup(f.read(), 'html.parser')
+            
+            # Ensure head exists
+            if not soup.head:
+                if soup.html:
+                    soup.html.insert(0, soup.new_tag('head'))
+                else:
+                    logging.warning(f"No html tag in {html_path.name}, skipping")
+                    continue
+            
+            expected_canonical = f"{base_url}{html_path.name}"
+            
+            # Handle canonical URL
+            canonical_link = soup.find('link', rel='canonical')
+            if canonical_link:
+                # Fix existing canonical
+                current_href = canonical_link.get('href', '')
+                
+                # Fix http to https
+                if current_href.startswith('http://cheatsheets.davidveksler.com/'):
+                    current_href = current_href.replace('http://cheatsheets.davidveksler.com/', 'https://cheatsheets.davidveksler.com/')
+                
+                # Update if different from expected
+                if current_href != expected_canonical:
+                    canonical_link['href'] = expected_canonical
+                    logging.info(f"Fixed canonical URL in {html_path.name}: {expected_canonical}")
+            else:
+                # Add missing canonical
+                canonical_tag = soup.new_tag('link', rel='canonical', href=expected_canonical)
+                # Insert canonical near the top of head, after charset and viewport
+                insert_position = 0
+                for i, child in enumerate(soup.head.children):
+                    if hasattr(child, 'name') and child.name == 'meta':
+                        if child.get('charset') or child.get('name') == 'viewport':
+                            insert_position = i + 1
+                
+                if insert_position < len(list(soup.head.children)):
+                    soup.head.insert(insert_position, canonical_tag)
+                else:
+                    soup.head.append(canonical_tag)
+                    
+                logging.info(f"Added canonical URL to {html_path.name}: {expected_canonical}")
+            
+            # Write back with minimal formatting changes
+            with open(html_path, 'w', encoding='utf-8') as f:
+                f.write(str(soup))
+            
+            logging.info(f"Updated canonical URL: {html_path.name}")
             
         except Exception as e:
-            logging.error(f"Update failed for {html_path.name}: {e}")
+            logging.error(f"Canonical URL update failed for {html_path.name}: {e}")
 
-def process_directory(directory: str = '.'):
+def process_directory(directory: str = '.', base_url: str = "https://cheatsheets.davidveksler.com/"):
     """Main pipeline: analyze, screenshot, update."""
     dir_path = Path(directory).resolve()
     images_dir = dir_path / 'images'
     
     logging.info(f"Processing directory: {dir_path}")
     
-    files_to_process = analyze_html_files(dir_path)
+    files_with_missing_images, files_needing_canonical_fix = analyze_html_files(dir_path, base_url)
     
-    if not files_to_process:
+    if not files_with_missing_images and not files_needing_canonical_fix:
         logging.info("No files need processing")
         return
     
-    logging.info(f"Found {len(files_to_process)} files to process")
+    logging.info(f"Found {len(files_with_missing_images)} files needing screenshots")
+    logging.info(f"Found {len(files_needing_canonical_fix)} files needing canonical URL fixes")
+    
+    # Generate screenshots only for files missing social media images
+    if files_with_missing_images:
+        generate_screenshots(files_with_missing_images, images_dir)
     
-    generate_screenshots(files_to_process, images_dir)
-    update_html_files(files_to_process)
+    # Update HTML files (both social media tags and canonical URLs)
+    update_html_files(files_with_missing_images, files_needing_canonical_fix, base_url)
     
     logging.info("Processing complete")