And we're back

Sun, Dec 22, 2024

After setting up my Bluesky to use a custom @johnpaulett handle (learn how to set up your own domain handle), a friend pointed out my website was dead! Apparently sometime in the past 15 years (🤯) without touching it, my custom Django blog app written in Django v1.3 got disconnected from its Heroku Postgres database.

Figured I should simplify the setup: there is no need for a database and an app server (which costs $10/mo) to serve a blog that probably gets a few pageviews a month; a statically generated site will work.

The Process:

Grabbed the blog_entry table as CSV from Heroku
Went to Claude AI to create a Python script to convert the CSV, whose content is reStructuredText, to Markdown (see blog-convert.py below). After a few successive prompts, had it working well! I had some timezone errors in my original blog code in the URL permalinks.
Grabbed images from archive.org (my S3 account had long been disabled)
Minor cleanup of content
Tweaks to the hyde theme.
GitHub Action to deploy as a GitHub Pages static site

All in a productive Sunday!

blog-convert.py:

#!/usr/bin/env python3
import csv
import os
import subprocess
from datetime import datetime
import json
from collections import defaultdict
from zoneinfo import ZoneInfo


def convert_rst_to_md(content):
    """Convert reStructuredText to Markdown using pandoc.

    The text is piped to pandoc over stdin as UTF-8, so no scratch file is
    created in the working directory and nothing is left behind when pandoc
    is missing or fails.

    Args:
        content: reStructuredText source to convert.

    Returns:
        The converted Markdown with surrounding whitespace stripped. If
        ``content`` is empty or ``None``, if pandoc cannot be started, or if
        pandoc exits with a non-zero status, ``content`` is returned
        unchanged (best effort).
    """
    if not content:
        # Nothing to convert. Also keeps pandoc from waiting on an inherited
        # stdin when there is no input to send.
        return content

    try:
        result = subprocess.run(
            ["pandoc", "-f", "rst", "-t", "markdown"],
            input=content,
            capture_output=True,
            text=True,
            encoding="utf-8",
        )
    except Exception as e:
        # Deliberately best-effort: keep the original text rather than abort
        # the whole conversion run (e.g. when pandoc is not installed).
        print(f"Error during conversion: {e}")
        return content

    if result.returncode != 0:
        print(f"Error converting RST to Markdown: {result.stderr}")
        return content

    return result.stdout.strip()


def format_tags(tags_str):
    """Split a raw tags string into a list of unique tags.

    The string is split on commas when it contains at least one comma,
    otherwise on whitespace. Each tag is stripped and empty tags are dropped.

    Args:
        tags_str: Raw tags value; may be empty or ``None``.

    Returns:
        A list of unique tags in first-seen order (empty for falsy input).
    """
    if not tags_str:
        return []

    # str.split(None) splits on runs of whitespace.
    separator = "," if "," in tags_str else None
    candidates = (tag.strip() for tag in tags_str.split(separator))

    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set would let the order change from run to run because of
    # string hash randomization, producing noisy diffs in generated files.
    return list(dict.fromkeys(tag for tag in candidates if tag))


def format_date(date_str):
    """Convert a ``YYYY-MM-DD HH:MM:SS<offset>`` string to ISO 8601.

    Args:
        date_str: Timestamp with a UTC offset, e.g. ``2010-05-01 12:30:00+00``.

    Returns:
        The timestamp as ``YYYY-MM-DDTHH:MM:SS+HH:MM``. If the input cannot
        be parsed, an error is printed and ``date_str`` is returned unchanged.
    """
    normalized = date_str
    # An hour-only offset such as "+00" or "-05" is not accepted by %z, so pad
    # it with minutes. Only a trailing offset is touched; offsets that already
    # carry minutes are left alone.
    if normalized[-3:-2] in ("+", "-") and normalized[-2:].isdigit():
        normalized += "00"

    try:
        dt = datetime.strptime(normalized, "%Y-%m-%d %H:%M:%S%z")
    except ValueError as e:
        print(f"Error parsing date {date_str}: {e}")
        return date_str

    # isoformat() renders the offset as +HH:MM for every offset, not just UTC.
    return dt.isoformat()


def create_frontmatter(row):
    """Create Hugo frontmatter from CSV row.

    Args:
        row: Mapping with the keys ``title``, ``pub_date``, ``public``,
            ``slug`` and ``tags``.

    Returns:
        A YAML frontmatter block delimited by ``---`` lines. It holds the
        title, ISO date, draft flag, slug, tags, and one alias of the form
        ``/YYYY/MM/DD/<slug>/`` built from the publication date expressed in
        America/Chicago time.

    Raises:
        KeyError: If a required key is missing from ``row``.
        ValueError: If ``pub_date`` cannot be parsed.
    """
    pub_date = row["pub_date"]
    # Pad an hour-only trailing offset ("+00", "-05") so %z can parse it.
    # Starting from the raw value means dates without such an offset are
    # parsed as-is instead of hitting an unbound variable.
    if pub_date[-3:-2] in ("+", "-") and pub_date[-2:].isdigit():
        pub_date += "00"
    dt = datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S%z")

    # astimezone() converts from whatever offset was parsed; replacing tzinfo
    # would merely relabel the wall-clock time and shift non-UTC timestamps.
    central_dt = dt.astimezone(ZoneInfo("America/Chicago"))

    # Create legacy URL alias using Central time
    legacy_url = (
        f"/{central_dt.year}/{central_dt.month:02d}/{central_dt.day:02d}/{row['slug']}/"
    )

    frontmatter = {
        "title": row["title"],
        "date": format_date(row["pub_date"]),
        "draft": row["public"] != "TRUE",
        "slug": row["slug"],
        "tags": format_tags(row["tags"]),
        "aliases": [legacy_url],
    }

    # Render as YAML. Every value goes through json.dumps so that strings are
    # quoted and escaped (JSON scalars are valid YAML), including list items
    # such as a tag that looks like a number or contains ": ".
    lines = ["---"]
    for key, value in frontmatter.items():
        if isinstance(value, list):
            if not value:
                # A bare "key:" would be read back as null rather than a list.
                lines.append(f"{key}: []")
                continue
            lines.append(f"{key}:")
            lines.extend(f"  - {json.dumps(item)}" for item in value)
        else:
            lines.append(f"{key}: {json.dumps(value)}")
    lines.append("---")

    return "\n".join(lines)


def create_year_index(year):
    """Build the text of the _index.md file for one year directory.

    Args:
        year: Year to show in the title and body.

    Returns:
        Frontmatter (title and ``yearly-archive`` type) followed by a
        one-line body, ending with a newline.
    """
    lines = [
        "---",
        f'title: "{year}"',
        'type: "yearly-archive"',
        "---",
        "",
        f"Posts from {year}",
        "",  # yields the trailing newline after joining
    ]
    return "\n".join(lines)


def get_post_year(pub_date):
    """Extract the year from a publication date string.

    Args:
        pub_date: Timestamp in ``YYYY-MM-DD HH:MM:SS<offset>`` form.

    Returns:
        The year as a string, taken from the timestamp in its own offset (no
        timezone conversion is applied). If the input cannot be parsed, an
        error is printed and ``"unknown"`` is returned.
    """
    normalized = pub_date
    # Pad an hour-only trailing offset ("+00", "-05") with minutes so %z can
    # parse it; offsets that already include minutes are left untouched.
    if normalized[-3:-2] in ("+", "-") and normalized[-2:].isdigit():
        normalized += "00"

    try:
        dt = datetime.strptime(normalized, "%Y-%m-%d %H:%M:%S%z")
    except ValueError as e:
        print(f"Error parsing date {pub_date}: {e}")
        return "unknown"

    return str(dt.year)


def create_directory(path):
    """Ensure ``path`` exists as a directory, creating missing parents."""
    # exist_ok=True turns the call into a no-op when the directory is present.
    os.makedirs(name=path, exist_ok=True)


def convert_csv_to_hugo(csv_path, published_dir, unpublished_dir):
    """Convert CSV entries to Hugo markdown files.

    Each row becomes one ``<slug>.md`` file holding the generated frontmatter
    followed by the row's description converted to Markdown. Rows whose
    ``public`` column is ``"TRUE"`` are written to ``published_dir/<year>/``;
    all other rows are treated as drafts and written directly into
    ``unpublished_dir``. After all rows are processed, an ``_index.md`` is
    written into every year directory that received a published post.

    Args:
        csv_path: Path to the UTF-8 CSV export. The code reads the columns
            ``title``, ``slug``, ``pub_date``, ``public``, ``tags`` and
            ``description``.
        published_dir: Root directory for published posts.
        unpublished_dir: Directory for draft posts.
    """
    # Create main directories
    create_directory(published_dir)
    create_directory(unpublished_dir)

    # Keep track of years for index files
    years_used = set()

    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires, so newlines embedded in quoted fields (the
    # multi-line description text) are read correctly.
    with open(csv_path, "r", encoding="utf-8", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            is_draft = row["public"] != "TRUE"
            year = get_post_year(row["pub_date"])

            # Published posts are grouped by year; drafts stay flat.
            if is_draft:
                target_dir = unpublished_dir
            else:
                target_dir = os.path.join(published_dir, year)
                create_directory(target_dir)
                years_used.add(year)

            # Name the file after the slug; fall back to the title when the
            # slug is empty. A "/" in the title would otherwise be taken as a
            # directory separator and point at a path that does not exist.
            stem = row["slug"]
            if not stem:
                stem = row["title"].lower().replace(" ", "-").replace("/", "-")
            output_path = os.path.join(target_dir, f"{stem}.md")

            frontmatter = create_frontmatter(row)

            # DictReader yields None for columns missing from a short row;
            # treat that like an empty description.
            content = convert_rst_to_md(row.get("description") or "")

            with open(output_path, "w", encoding="utf-8") as f:
                f.write(f"{frontmatter}\n\n{content}")

            print(f"Created {output_path}")

    # Create _index.md files for each year
    for year in sorted(years_used):
        index_path = os.path.join(published_dir, year, "_index.md")
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(create_year_index(year))
        print(f"Created year index {index_path}")


if __name__ == "__main__":
    # Input export and the two output locations, relative to the working
    # directory the script is run from.
    CSV_PATH = "legacy/johnpaulettcom_blog_entry.csv"
    PUBLISHED_DIR = "content/posts"
    UNPUBLISHED_DIR = "legacy/unpublished"

    # Run the conversion, then report completion.
    convert_csv_to_hugo(
        csv_path=CSV_PATH,
        published_dir=PUBLISHED_DIR,
        unpublished_dir=UNPUBLISHED_DIR,
    )
    print("Conversion completed!")