SET CANONICAL URLS
· 1 year ago
525d87527efa6dd57928d55b3a1560714c24e5de
Parent:
bcf98770f
1 file changed +104 −15
- generate-image-previews.py +104 −15
Diff
--- a/generate-image-previews.py +++ b/generate-image-previews.py @@ -6,15 +6,17 @@ from typing import List, Tuple logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') -def analyze_html_files(directory: Path) -> List[Tuple[Path, str]]: - """Return (html_path, image_filename) for files missing og/twitter images.""" +def analyze_html_files(directory: Path, base_url: str = "https://cheatsheets.davidveksler.com/") -> Tuple[List[Tuple[Path, str]], List[Path]]: + """Return files missing og/twitter images and files needing canonical URL fixes.""" missing_images = [] + needs_canonical_fix = [] for html_file in directory.glob('*.html'): try: with open(html_file, 'r', encoding='utf-8', errors='ignore') as f: soup = BeautifulSoup(f.read(), 'html.parser') + # Check for missing social media images has_og = soup.find('meta', property='og:image') is not None has_twitter = soup.find('meta', attrs={'name': 'twitter:image'}) is not None @@ -22,14 +24,36 @@ def analyze_html_files(directory: Path) -> List[Tuple[Path, str]]: image_name = f"{html_file.stem}.png" missing_images.append((html_file, image_name)) logging.info(f"Missing images: {html_file.name}") + + # Check canonical URL + canonical_link = soup.find('link', rel='canonical') + expected_canonical = f"{base_url}{html_file.name}" + needs_fix = False + + if not canonical_link: + needs_fix = True + logging.info(f"Missing canonical URL: {html_file.name}") + else: + current_href = canonical_link.get('href', '') + # Check if needs http->https fix or incorrect URL + if (current_href.startswith('http://cheatsheets.davidveksler.com/') or + current_href != expected_canonical): + needs_fix = True + logging.info(f"Incorrect canonical URL: {html_file.name} ({current_href})") + + if needs_fix: + needs_canonical_fix.append(html_file) except Exception as e: logging.error(f"Error parsing {html_file.name}: {e}") - return missing_images + return missing_images, needs_canonical_fix def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: Path): """Generate screenshots for HTML files.""" + if not files_to_process: + return + images_dir.mkdir(exist_ok=True) with sync_playwright() as p: @@ -49,7 +73,7 @@ def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: P """) image_path = images_dir / image_name - page.screenshot(path=image_path, type='png') # Removed quality parameter + page.screenshot(path=image_path, type='png') logging.info(f"Generated: {image_name}") except Exception as e: @@ -57,9 +81,13 @@ def generate_screenshots(files_to_process: List[Tuple[Path, str]], images_dir: P browser.close() -def update_html_files(files_to_process: List[Tuple[Path, str]]): - """Add og:image and twitter:image meta tags to HTML files.""" - for html_path, image_name in files_to_process: +def update_html_files(files_with_missing_images: List[Tuple[Path, str]], + files_needing_canonical_fix: List[Path], + base_url: str = "https://cheatsheets.davidveksler.com/"): + """Add og:image, twitter:image meta tags and fix canonical URLs.""" + + # Process files missing social media images + for html_path, image_name in files_with_missing_images: try: with open(html_path, 'r', encoding='utf-8', errors='ignore') as f: soup = BeautifulSoup(f.read(), 'html.parser') @@ -88,28 +116,89 @@ def update_html_files(files_to_process: List[Tuple[Path, str]]): with open(html_path, 'w', encoding='utf-8') as f: f.write(str(soup)) - logging.info(f"Updated: {html_path.name}") + logging.info(f"Updated social media tags: {html_path.name}") + + except Exception as e: + logging.error(f"Social media update failed for {html_path.name}: {e}") + + # Process files needing canonical URL fixes + for html_path in files_needing_canonical_fix: + try: + with open(html_path, 'r', encoding='utf-8', errors='ignore') as f: + soup = BeautifulSoup(f.read(), 'html.parser') + + # Ensure head exists + if not soup.head: + if soup.html: + soup.html.insert(0, soup.new_tag('head')) + else: + logging.warning(f"No html tag in {html_path.name}, skipping") + continue + + expected_canonical = f"{base_url}{html_path.name}" + + # Handle canonical URL + canonical_link = soup.find('link', rel='canonical') + if canonical_link: + # Fix existing canonical + current_href = canonical_link.get('href', '') + + # Fix http to https + if current_href.startswith('http://cheatsheets.davidveksler.com/'): + current_href = current_href.replace('http://cheatsheets.davidveksler.com/', 'https://cheatsheets.davidveksler.com/') + + # Update if different from expected + if current_href != expected_canonical: + canonical_link['href'] = expected_canonical + logging.info(f"Fixed canonical URL in {html_path.name}: {expected_canonical}") + else: + # Add missing canonical + canonical_tag = soup.new_tag('link', rel='canonical', href=expected_canonical) + # Insert canonical near the top of head, after charset and viewport + insert_position = 0 + for i, child in enumerate(soup.head.children): + if hasattr(child, 'name') and child.name == 'meta': + if child.get('charset') or child.get('name') == 'viewport': + insert_position = i + 1 + + if insert_position < len(list(soup.head.children)): + soup.head.insert(insert_position, canonical_tag) + else: + soup.head.append(canonical_tag) + + logging.info(f"Added canonical URL to {html_path.name}: {expected_canonical}") + + # Write back with minimal formatting changes + with open(html_path, 'w', encoding='utf-8') as f: + f.write(str(soup)) + + logging.info(f"Updated canonical URL: {html_path.name}") except Exception as e: - logging.error(f"Update failed for {html_path.name}: {e}") + logging.error(f"Canonical URL update failed for {html_path.name}: {e}") -def process_directory(directory: str = '.'): +def process_directory(directory: str = '.', base_url: str = "https://cheatsheets.davidveksler.com/"): """Main pipeline: analyze, screenshot, update.""" dir_path = Path(directory).resolve() images_dir = dir_path / 'images' logging.info(f"Processing directory: {dir_path}") - files_to_process = analyze_html_files(dir_path) + files_with_missing_images, files_needing_canonical_fix = analyze_html_files(dir_path, base_url) - if not files_to_process: + if not files_with_missing_images and not files_needing_canonical_fix: logging.info("No files need processing") return - logging.info(f"Found {len(files_to_process)} files to process") + logging.info(f"Found {len(files_with_missing_images)} files needing screenshots") + logging.info(f"Found {len(files_needing_canonical_fix)} files needing canonical URL fixes") + + # Generate screenshots only for files missing social media images + if files_with_missing_images: + generate_screenshots(files_with_missing_images, images_dir) - generate_screenshots(files_to_process, images_dir) - update_html_files(files_to_process) + # Update HTML files (both social media tags and canonical URLs) + update_html_files(files_with_missing_images, files_needing_canonical_fix, base_url) logging.info("Processing complete")