#!/bin/bash
#
# Fetches all Prince XML documentation pages and combines them into
# a single HTML file suitable for LLM consumption.
#
# Usage: ./build-single-page.sh [output-file]
#
# Requires: bash, curl, sed, awk, grep, and standard utilities (mktemp, head, wc, tr)
#

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Root URL under which every documentation page is fetched.
BASE='https://www.princexml.com/doc'

# Destination file: the first command-line argument, or a default file name.
OUTPUT=${1:-prince-doc-single.html}
# Documentation page slugs, in reading order. Each slug is appended to $BASE
# to form the download URL and is also used as the id of the page's <section>.
PAGES=()

# Installation Guide
PAGES+=(installing first-doc help-install)

# User Guide
PAGES+=(
    intro-userguide styling paged gen-content javascript graphics cookbook
    help prince-input prince-output prince-networking server-integration
    prince-for-books
)

# Reference Guide
PAGES+=(
    css-length-units css-props css-selectors css-media-queries css-functions
    css-at-rules css-color-names css-refs js-support command-line
    page-size-keywords characters acknowledgements
)

# Scratch directory that holds the downloaded pages. The EXIT trap removes it
# on every exit path, whether the script finishes normally or aborts.
TMPDIR="$(mktemp -d)"
cleanup() {
    rm -rf "$TMPDIR"
}
trap cleanup EXIT

echo "Fetching ${#PAGES[@]} documentation pages..."

# Download every page into the scratch directory, pausing briefly between
# requests to be polite to the server.
for slug in "${PAGES[@]}"; do
    echo "  Fetching: $slug"
    # --fail:       treat HTTP errors (404, 500, ...) as failures instead of
    #               saving the server's error page as if it were documentation.
    # --show-error: keep --silent's quiet progress output but still print the
    #               reason when a transfer fails.
    if ! curl --fail --silent --show-error --location \
            "$BASE/$slug" -o "$TMPDIR/$slug.html"; then
        echo "Error: could not fetch $BASE/$slug" >&2
        exit 1
    fi
    sleep 0.3
done

printf '%s\n' "Extracting content and building single page..."

# Begin the combined document. This creates (or truncates) the output file and
# writes the HTML preamble, the inline stylesheet, the page heading, and the
# opening tags of the table-of-contents list. The quoted delimiter keeps the
# here-document literal, so nothing inside it is expanded by the shell.
cat <<'HEADER' > "$OUTPUT"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Prince XML Documentation (Single Page)</title>
<style>
body { font-family: system-ui, sans-serif; max-width: 50em; margin: 2em auto; padding: 0 1em; line-height: 1.5; }
h1 { border-bottom: 2px solid #333; padding-bottom: 0.3em; }
h2 { border-bottom: 1px solid #ccc; padding-bottom: 0.2em; margin-top: 2em; }
section { margin-bottom: 3em; }
pre, code { background: #f4f4f4; }
pre { padding: 1em; overflow-x: auto; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ccc; padding: 0.4em 0.8em; text-align: left; }
nav ul { column-count: 2; }
nav li { margin-bottom: 0.3em; }
.separator { border: none; border-top: 2px solid #999; margin: 3em 0; }
</style>
</head>
<body>
<h1>Prince XML Documentation</h1>
<nav>
<h2>Table of Contents</h2>
<ul>
HEADER

# Append one table-of-contents entry per page, then close the list and the
# <nav> element that the document header opened.
{
    for slug in "${PAGES[@]}"; do
        # Page title: taken from the first line that holds a complete
        # <title>...</title> pair, with a trailing "| Prince..." suffix and
        # any leading spaces removed.
        title=$(sed -n 's/.*<title>\(.*\)<\/title>.*/\1/p' "$TMPDIR/$slug.html" \
            | head -1 \
            | sed -e 's/ *| *Prince.*//' -e 's/^ *//')
        # Fall back to the slug when no title could be extracted.
        [[ -n "$title" ]] || title="$slug"
        printf '<li><a href="#%s">%s</a></li>\n' "$slug" "$title"
    done

    cat <<'MIDHEADER'
</ul>
</nav>
MIDHEADER
} >> "$OUTPUT"

# Extract main content from each page and append it to the output as a
# <section> whose id is the page slug.

#######################################
# Rewrite site-relative URLs in the HTML read from stdin; result on stdout.
#   href="/doc/page#frag" -> href="#frag"  (ids are not altered by this script,
#                                           so the fragment alone is the target)
#   href="/doc/page/"     -> href="#page"  (trailing slash dropped so the link
#                                           matches the section id)
#   href="/doc/page"      -> href="#page"
#   src="/path"           -> src="https://www.princexml.com/path"
# Protocol-relative sources (src="//host/...") are left untouched.
#######################################
rewrite_doc_links() {
    sed -e 's|href="/doc/[^"#]*#\([^"]*\)"|href="#\1"|g' \
        -e 's|href="/doc/\([^"#]*[^"#/]\)/\{1,\}"|href="#\1"|g' \
        -e 's|href="/doc/\([^"]*\)"|href="#\1"|g' \
        -e 's|src="/\([^/]\)|src="https://www.princexml.com/\1|g'
}

#######################################
# Print one page as a separator plus a <section> wrapping its main content.
# The content region is chosen by the first strategy that yields text:
#   1. <main> ... </main>
#   2. <article> ... </article>
#   3. a line with a class attribute containing "content", through the next
#      </div>, capped at 2000 lines
#   4. <body> ... </body> with <nav>, <header> and <footer> ranges removed
# All matching is line-oriented: a range runs from a line matching the opening
# pattern through the next line matching the closing pattern.
# Arguments: $1 - page slug (used as the section id)
#            $2 - path to the downloaded HTML file
# Outputs:   the section markup on stdout
# Returns:   non-zero if the page file cannot be read
#######################################
render_page_section() {
    local slug=$1
    local page=$2
    local extracted=""

    if [[ ! -r "$page" ]]; then
        echo "Error: missing downloaded page: $page" >&2
        return 1
    fi

    # Try <main>
    if grep -q '<main' "$page"; then
        extracted=$(awk '/<main/,/<\/main>/' "$page")
    fi

    # Try article
    if [[ -z "$extracted" ]] && grep -q '<article' "$page"; then
        extracted=$(awk '/<article/,/<\/article>/' "$page")
    fi

    # Try div with content/doc class. The 2000-line cap is applied inside awk
    # rather than with "| head": under pipefail, head closing the pipe early
    # would kill awk with SIGPIPE and abort the whole script.
    if [[ -z "$extracted" ]]; then
        extracted=$(awk '/class="[^"]*content[^"]*"/,/<\/div>/ { if (++n > 2000) exit; print }' "$page")
    fi

    # Fallback: strip nav/header/footer and use the body
    if [[ -z "$extracted" ]]; then
        extracted=$(awk '/<body/,/<\/body>/' "$page" \
            | sed -e '/<nav/,/<\/nav>/d' \
                  -e '/<header/,/<\/header>/d' \
                  -e '/<footer/,/<\/footer>/d')
    fi

    printf '<hr class="separator">\n<section id="%s">\n' "$slug"
    printf '%s\n' "$extracted" | rewrite_doc_links
    printf '</section>\n'
}

for slug in "${PAGES[@]}"; do
    render_page_section "$slug" "$TMPDIR/$slug.html" >> "$OUTPUT"
done

# Finish the document by closing the <body> and <html> elements.
printf '%s\n' '</body>' '</html>' >> "$OUTPUT"

# Report the size of the generated file and suggest plain-text conversions.
# Lower-case names are used on purpose: LINES is a variable bash maintains
# itself (terminal height, refreshed when checkwinsize is enabled), so the
# script should not store its own data in it.
# tr strips the padding some wc implementations put around the count.
size_bytes=$(wc -c < "$OUTPUT" | tr -d ' ')
line_count=$(wc -l < "$OUTPUT" | tr -d ' ')
echo ""
echo "Done! Created: $OUTPUT"
echo "  Size: $(( size_bytes / 1024 )) KB ($line_count lines)"
echo ""
echo "Tip: For LLM use, you may want a plain-text version:"
# %q shell-quotes the path so the suggested commands can be pasted verbatim
# even when the output name contains spaces or other special characters.
printf '  pandoc -f html -t plain %q -o prince-doc.txt\n' "$OUTPUT"
printf '  or: lynx -dump %q > prince-doc.txt\n' "$OUTPUT"
