#!/bin/bash
#
# Fetches all Prince XML documentation pages and combines them into
# a single HTML file suitable for LLM consumption.
#
# Usage: ./build-single-page.sh [output-file]
#
# Requires: curl, sed, awk
#

set -euo pipefail

OUTPUT="${1:-prince-doc-single.html}"
SITE="https://www.princexml.com"
BASE="$SITE/doc"

# All doc pages in reading order
PAGES=(
  # Installation Guide
  installing
  first-doc
  help-install
  # User Guide
  intro-userguide
  styling
  paged
  gen-content
  javascript
  graphics
  cookbook
  help
  prince-input
  prince-output
  prince-networking
  server-integration
  prince-for-books
  # Reference Guide
  css-length-units
  css-props
  css-selectors
  css-media-queries
  css-functions
  css-at-rules
  css-color-names
  css-refs
  js-support
  command-line
  page-size-keywords
  characters
  acknowledgements
)

# Scratch directory holding the downloaded pages. Deliberately NOT named
# TMPDIR: that variable is read by mktemp and by child processes.
WORK_DIR=""

#######################################
# Prints a message to stderr and aborts the script.
# Arguments: $* - message text
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# EXIT trap: removes the scratch directory if it was created.
# Globals: WORK_DIR (read)
#######################################
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Aborts early when one of the external tools is missing.
#######################################
require_tools() {
  local tool
  for tool in curl sed awk; do
    command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
  done
}

#######################################
# Downloads every page listed in PAGES into WORK_DIR/<slug>.html.
# Globals: PAGES, BASE, WORK_DIR (read)
#######################################
fetch_pages() {
  local slug
  echo "Fetching ${#PAGES[@]} documentation pages..."
  # Fetch all pages (with a small delay to be polite)
  for slug in "${PAGES[@]}"; do
    printf ' Fetching: %s\n' "$slug"
    # -f: treat HTTP errors (404, 500, ...) as failures instead of saving
    # the server's error page as if it were documentation.
    curl -fsSL "$BASE/$slug" -o "$WORK_DIR/$slug.html" \
      || die "failed to fetch $BASE/$slug"
    sleep 0.3
  done
}

#######################################
# Prints the human-readable title of a downloaded page.
# Uses the first line holding a complete <title>...</title>, drops a
# trailing " | Prince..." suffix and leading spaces. Falls back to the slug.
# Arguments: $1 - page slug, $2 - path to the downloaded HTML file
# Outputs:   the title on stdout
#######################################
page_title() {
  local slug=$1 file=$2 title
  # One sed program that quits after the first match; avoids a
  # "sed | head -1" pipeline, which can die of SIGPIPE under pipefail.
  title=$(sed -n '
    s/.*<title>\(.*\)<\/title>.*/\1/
    t found
    b
    :found
    s/ *| *Prince.*//
    s/^ *//
    p
    q
  ' "$file")
  if [[ -z "$title" ]]; then
    title=$slug
  fi
  printf '%s\n' "$title"
}

#######################################
# Prints the main content area of a downloaded page.
# Strategy: <main>...</main>, then <article>...</article>, then a div whose
# class contains "content", and finally the <body> with nav/header/footer
# ranges removed.
# Arguments: $1 - path to the downloaded HTML file
# Outputs:   extracted HTML on stdout
#######################################
extract_content() {
  local file=$1 extracted=""

  # Try <main>
  if grep -q '<main' "$file"; then
    extracted=$(awk '/<main/,/<\/main>/' "$file")
  fi

  # Try <article>
  if [[ -z "$extracted" ]] && grep -q '<article' "$file"; then
    extracted=$(awk '/<article/,/<\/article>/' "$file")
  fi

  # Try a div with a content class, capped at 2000 lines. The cap is
  # enforced inside awk: piping into "head -2000" would kill awk with
  # SIGPIPE on long pages and abort the script under pipefail.
  if [[ -z "$extracted" ]]; then
    extracted=$(awk '
      /class="[^"]*content[^"]*"/,/<\/div>/ {
        if (++n > 2000) exit
        print
      }
    ' "$file")
  fi

  # Fallback: strip nav/header/footer and use the body
  if [[ -z "$extracted" ]]; then
    extracted=$(awk '/<body/,/<\/body>/' "$file" \
      | sed -e '/<nav/,/<\/nav>/d' \
            -e '/<header/,/<\/header>/d' \
            -e '/<footer/,/<\/footer>/d')
  fi

  printf '%s\n' "$extracted"
}

#######################################
# Filter (stdin -> stdout) that makes links usable in the combined file.
#   /doc/<slug>/#frag -> #frag    (fragment ids are kept as-is)
#   /doc/<slug>[/]    -> #<slug>  (matches the <section id> emitted here)
#   other root-relative href/src -> absolute URLs on SITE
# Protocol-relative URLs ("//host/...") are left untouched.
# Globals: SITE (read)
#######################################
rewrite_links() {
  sed \
    -e 's|href="/doc/[^"#]*#\([^"]*\)"|href="#\1"|g' \
    -e 's|href="/doc/\([^"/#]*\)/*"|href="#\1"|g' \
    -e "s|href=\"/\([^/\"]\)|href=\"${SITE}/\1|g" \
    -e "s|src=\"/\([^/\"]\)|src=\"${SITE}/\1|g"
}

#######################################
# Writes the complete combined HTML document to stdout.
# Globals: PAGES, WORK_DIR (read)
#######################################
build_document() {
  local slug title content

  # Document skeleton: opens <body>, the page heading and the TOC list;
  # <ul>/<nav> are closed after the TOC entries, <body>/<html> at the end.
  # NOTE(review): kept deliberately minimal - no stylesheet is emitted for
  # the "separator" class used below; add one here if styling matters.
  cat <<'HEADER'
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Prince XML Documentation (Single Page)</title>
</head>
<body>
<h1>Prince XML Documentation</h1>
<nav>
<ul>
HEADER

  # Table of contents: one entry per page, linking to its section id.
  for slug in "${PAGES[@]}"; do
    title=$(page_title "$slug" "$WORK_DIR/$slug.html")
    printf '<li><a href="#%s">%s</a></li>\n' "$slug" "$title"
  done

  cat <<'MIDHEADER'
</ul>
</nav>
MIDHEADER

  # Extract main content from each page and append
  for slug in "${PAGES[@]}"; do
    content=$(extract_content "$WORK_DIR/$slug.html")
    printf '<hr class="separator">\n'
    printf '<section id="%s">\n' "$slug"
    printf '%s\n' "$content" | rewrite_links
    printf '</section>\n'
  done

  # Close the document
  cat <<'FOOTER'
</body>
</html>
FOOTER
}

#######################################
# Prints the size of the generated file and conversion tips.
# Globals: OUTPUT (read)
#######################################
report() {
  # Not named LINES: bash maintains a special variable of that name.
  local size_bytes line_count
  size_bytes=$(wc -c < "$OUTPUT" | tr -d ' ')
  line_count=$(wc -l < "$OUTPUT" | tr -d ' ')

  echo ""
  printf 'Done! Created: %s\n' "$OUTPUT"
  printf ' Size: %s KB (%s lines)\n' "$(( size_bytes / 1024 ))" "$line_count"
  echo ""
  echo "Tip: For LLM use, you may want a plain-text version:"
  printf ' pandoc -f html -t plain %s -o prince-doc.txt\n' "$OUTPUT"
  printf ' or: lynx -dump %s > prince-doc.txt\n' "$OUTPUT"
}

main() {
  require_tools

  WORK_DIR=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT

  fetch_pages

  echo "Extracting content and building single page..."
  # Single redirect: the document is produced in one pass on stdout.
  build_document > "$OUTPUT"

  report
}

main "$@"