#!/usr/bin/env node
/**
 * Enhanced Index Generator for The Fold Within
 * REFACTORED: Full metadata extraction from frontmatter
 *
 * Priority order for dates:
 *   1. Frontmatter date (original)
 *   2. Filename date (YYYY-MM-DD)
 *   3. Git mtime
 *   4. Git ctime
 *
 * Outputs (all written under ./public):
 *   index.json   — flat file list with full metadata
 *   sitemap.xml  — uses originalDate for <lastmod>
 *   robots.txt   — sitemap pointer
 *   feed.xml     — RSS 2.0, newest 20 dated entries
 *   schema.jsonld — Organization + WebSite structured data
 */

import { promises as fs } from "fs";
import path from "path";
import pdf from "pdf-parse";

const ROOT = "public";
const BASE_URL = "https://thefoldwithin.earth";
const OUT_JSON = path.join(ROOT, "index.json");
const OUT_SITEMAP = path.join(ROOT, "sitemap.xml");
const OUT_ROBOTS = path.join(ROOT, "robots.txt");
const OUT_FEED = path.join(ROOT, "feed.xml");
const OUT_SCHEMA = path.join(ROOT, "schema.jsonld");
const EXCERPT_LENGTH = 400;

// ═══════════════════════════════════════════════════════════════
// EXTRACTORS - Pull metadata from frontmatter
// ═══════════════════════════════════════════════════════════════

/**
 * Parse a YAML-ish frontmatter block (`---` … `---`) at the top of a
 * markdown file into a metadata object.
 *
 * @param {string} content - Raw file text.
 * @returns {object|null} Metadata, or null when no frontmatter block exists.
 */
function extractFrontmatter(content) {
  const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
  if (!fmMatch) return null;

  const fm = fmMatch[1];
  return {
    date: fm.match(/^date:\s*(\d{4}-\d{2}-\d{2})/m)?.[1] ?? null,
    authors: extractAuthors(fm),
    notion_id: fm.match(/^notion_id:\s*(.+)$/m)?.[1]?.trim() ?? null,
    notion_created: fm.match(/^notion_created:\s*(.+)$/m)?.[1]?.trim() ?? null,
    source: fm.match(/^source:\s*(.+)$/m)?.[1]?.trim() ?? null,
    tags: extractTags(fm),
    type: fm.match(/^type:\s*(.+)$/m)?.[1]?.trim() ?? "fieldnote",
    status: fm.match(/^status:\s*(.+)$/m)?.[1]?.trim() ?? "draft",
    series: fm.match(/^series:\s*(.+)$/m)?.[1]?.trim() ?? null,
    version: fm.match(/^version:\s*(.+)$/m)?.[1]?.trim() ?? "0.1",
    layer: fm.match(/^layer:\s*(.+)$/m)?.[1]?.trim() ?? null,
  };
}

/**
 * Extract a comma-separated `author:`/`authors:` list from frontmatter text.
 * @param {string} fm - Frontmatter body.
 * @returns {string[]} Trimmed, non-empty author names (possibly empty).
 */
function extractAuthors(fm) {
  const match = fm.match(/^author[s]?:\s*(.+)$/m);
  if (!match) return [];
  return match[1].split(",").map((a) => a.trim()).filter((a) => a);
}

/**
 * Extract a comma-separated `tags:` list, lower-cased.
 * @param {string} fm - Frontmatter body (or raw content for non-md files).
 * @returns {string[]} Trimmed, non-empty, lower-case tags (possibly empty).
 */
function extractTags(fm) {
  const match = fm.match(/^tags:\s*(.+)$/m);
  if (!match) return [];
  return match[1].split(",").map((t) => t.trim().toLowerCase()).filter((t) => t);
}

/**
 * Fallback: extract a YYYY-MM-DD date from the start of a filename.
 * @param {string} name - Base filename.
 * @returns {string|null}
 */
function dateFromName(name) {
  const m = name.match(/^(\d{4}-\d{2}-\d{2})/);
  return m ? m[1] : null;
}

// ═══════════════════════════════════════════════════════════════
// PARSERS - Extract content from files
// ═══════════════════════════════════════════════════════════════

/**
 * Read the head (or the whole body) of a file as UTF-8.
 *
 * BUG FIX: previously `full === true` capped the read at
 * EXCERPT_LENGTH * 2 (800 bytes) — *less* than the 64 KiB default —
 * which truncated long frontmatter and titles. `full` now reads the
 * entire file, capped at 1 MiB. Also closes the handle on error.
 *
 * @param {string} abs - Absolute (or cwd-relative) file path.
 * @param {boolean} [full=false] - Read the whole file instead of 64 KiB.
 * @returns {Promise<string>}
 */
async function readHead(abs, full = false) {
  const st = await fs.stat(abs);
  const cap = full
    ? Math.min(st.size, 1024 * 1024)
    : Math.min(st.size, 64 * 1024);
  const fh = await fs.open(abs, "r");
  try {
    const buf = Buffer.alloc(cap);
    const { bytesRead } = await fh.read(buf, 0, cap, 0);
    return buf.subarray(0, bytesRead).toString("utf8");
  } finally {
    // Guarantee the handle is released even if read() throws.
    await fh.close();
  }
}

/**
 * Pull a human title out of raw content.
 * md:   first `# Heading` line.
 * html: contents of the <title> element (regex reconstructed — the
 *       original had lost its opening `<title` token).
 *
 * @param {string} raw - File text.
 * @param {string} ext - Lower-case extension including the dot.
 * @returns {string|null|undefined}
 */
function parseTitle(raw, ext) {
  if (ext === ".md") return raw.match(/^\s*#\s+(.+?)\s*$/m)?.[1].trim();
  if (ext === ".html") return raw.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1].trim();
  return null;
}

/**
 * Produce a plain-text excerpt of at most EXCERPT_LENGTH characters.
 * md:   drop the leading `#` heading line.
 * html: drop the <head> section, then strip all tags.
 *
 * @param {string} raw - File text.
 * @param {string} ext - Lower-case extension including the dot.
 * @returns {string}
 */
function extractExcerpt(raw, ext) {
  if (ext === ".md") raw = raw.replace(/^#.*\n/, "").trim();
  if (ext === ".html") {
    raw = raw
      .replace(/<head[\s\S]*?<\/head>/i, "")
      .replace(/<[^>]+>/g, " ")
      .trim();
  }
  return raw.replace(/\s+/g, " ").slice(0, EXCERPT_LENGTH);
}

// ═══════════════════════════════════════════════════════════════
// GENERATORS - Create outputs
// ═══════════════════════════════════════════════════════════════

/**
 * Escape the five XML-significant characters so titles and paths
 * containing `&`, `<`, etc. do not produce invalid sitemap/feed XML.
 * @param {unknown} s
 * @returns {string}
 */
function escapeXml(s) {
  return String(s)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&apos;");
}

/**
 * Build sitemap.xml. Static pages first, then every non-index file
 * that has an original (frontmatter or filename) date.
 * @param {Array<object>} flat - Collected file records.
 * @returns {string} Complete XML document.
 */
function generateSitemap(flat) {
  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n`;

  const staticPages = ["", "/about", "/about/solaria", "/about/mark", "/about/initiatives", "/fieldnotes"];
  for (const page of staticPages) {
    xml += `  <url>\n    <loc>${BASE_URL}${page}/</loc>\n    <changefreq>weekly</changefreq>\n    <priority>${page === "" ? "1.0" : "0.8"}</priority>\n  </url>\n`;
  }

  for (const f of flat.filter((x) => !x.isIndex && x.originalDate)) {
    // Turn "dir/file.md" into "dir/file/" pretty URLs.
    const urlPath = f.path.replace(/\.(md|html|pdf)$/, "/").replace("//", "/");
    xml += `  <url>\n    <loc>${BASE_URL}/${escapeXml(urlPath)}</loc>\n    <lastmod>${f.originalDate}</lastmod>\n    <changefreq>monthly</changefreq>\n  </url>\n`;
  }

  return xml + "</urlset>";
}

/** Build robots.txt with a sitemap pointer. */
function generateRobots() {
  return `# robots.txt for The Fold Within Earth\nSitemap: ${BASE_URL}/sitemap.xml\n`;
}

/**
 * Build feed.xml: an RSS 2.0 feed of the 20 newest dated, non-index files.
 * @param {Array<object>} flat - Collected file records.
 * @returns {string} Complete XML document.
 */
function generateFeed(flat) {
  const items = flat
    .filter((f) => !f.isIndex && f.originalDate)
    .sort((a, b) => new Date(b.originalDate) - new Date(a.originalDate))
    .slice(0, 20);

  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<rss version="2.0">\n<channel>\n<title>The Fold Within Earth</title>\n<link>${BASE_URL}</link>\n<description>Recursive Coherence Theory. Human-AI Co-evolution.</description>\n`;

  for (const f of items) {
    const urlPath = f.path.replace(/\.(md|html|pdf)$/, "/").replace("//", "/");
    xml += `  <item>\n    <title>${escapeXml(f.title || f.name)}</title>\n    <link>${BASE_URL}/${escapeXml(urlPath)}</link>\n    <pubDate>${new Date(f.originalDate).toUTCString()}</pubDate>\n  </item>\n`;
  }

  return xml + "</channel>\n</rss>\n";
}

/**
 * Build schema.jsonld: Organization + WebSite nodes in one @graph.
 * @param {Array<object>} flat - Collected file records (unused, kept for signature stability).
 * @param {string[]} sections - Top-level section names (unused, kept for signature stability).
 * @param {string[]} tags - All unique tags, joined into keywords.
 * @returns {string} Pretty-printed JSON-LD.
 */
function generateSchema(flat, sections, tags) {
  const org = {
    "@context": "https://schema.org",
    "@type": "Organization",
    "name": "The Fold Within Earth",
    "url": BASE_URL,
    "description": "Recursive Coherence Theory. Human-AI Co-evolution. Sacred Geometry of WE.",
    "foundingDate": "2024",
    "keywords": tags.join(", "),
  };

  const website = {
    "@context": "https://schema.org",
    "@type": "WebSite",
    "name": "The Fold Within Earth",
    "url": BASE_URL,
  };

  return JSON.stringify({ "@graph": [org, website] }, null, 2);
}

// ═══════════════════════════════════════════════════════════════
// MAIN COLLECTOR
// ═══════════════════════════════════════════════════════════════

/**
 * Recursively walk ROOT collecting .md / .html / .pdf files into `flat`.
 * Skips dotfiles and the top-level index.html/index.md; sub-directory
 * index files are kept but flagged `isIndex`.
 *
 * @param {string} [relBase=""] - Directory relative to ROOT.
 * @param {Array<object>} [flat=[]] - Accumulator, returned for convenience.
 * @returns {Promise<Array<object>>}
 */
async function collectFiles(relBase = "", flat = []) {
  const abs = path.join(ROOT, relBase);
  const entries = await fs.readdir(abs, { withFileTypes: true });

  for (const e of entries) {
    if (e.name.startsWith(".")) continue;

    const rel = path.posix.join(relBase, e.name);
    const absPath = path.join(ROOT, rel);

    // Only the site-root index pages are excluded entirely.
    if (rel.toLowerCase() === "index.html" || rel.toLowerCase() === "index.md") continue;

    if (e.isDirectory()) {
      await collectFiles(rel, flat);
      continue;
    }

    const ext = path.posix.extname(e.name).toLowerCase();
    if (![".md", ".html", ".pdf"].includes(ext)) continue;

    const st = await fs.stat(absPath);
    const raw = ext === ".pdf"
      ? (await pdf(await fs.readFile(absPath))).text
      : await readHead(absPath, true);

    const title = parseTitle(raw, ext) || e.name.replace(new RegExp(`\\${ext}$`), "").trim();
    const fm = ext === ".md" ? extractFrontmatter(raw) : null;

    // PRIORITY: frontmatter date → filename → mtime → ctime
    const datePriority = [
      fm?.date,
      dateFromName(e.name),
      new Date(st.mtimeMs).toISOString().split("T")[0],
      new Date(st.ctimeMs).toISOString().split("T")[0],
    ].find(Boolean);

    flat.push({
      // BUG FIX: this field was `type: "file"` and silently shadowed by the
      // frontmatter `type` below (duplicate key). Renamed to `kind`.
      kind: "file",
      name: e.name,
      title,
      path: rel,
      ext,
      // Core fields (for frontend)
      date: datePriority,
      originalDate: fm?.date ?? dateFromName(e.name) ?? null,
      // Metadata from frontmatter (normalized to null, never undefined,
      // so index.json has a stable key set)
      authors: fm?.authors ?? [],
      notion_id: fm?.notion_id ?? null,
      notion_created: fm?.notion_created ?? null,
      source: fm?.source ?? null,
      // extractTags takes one argument; with frontmatter present its
      // (possibly empty) tags array is authoritative.
      tags: fm ? fm.tags : extractTags(raw),
      type: fm?.type ?? "fieldnote",
      status: fm?.status ?? "draft",
      series: fm?.series ?? null,
      version: fm?.version ?? "0.1",
      layer: fm?.layer ?? null,
      // Content
      excerpt: extractExcerpt(raw, ext),
      isIndex: e.name.toLowerCase().startsWith("index."),
      // Timestamps (for debugging)
      mtime: new Date(st.mtimeMs).toISOString(),
      ctime: new Date(st.ctimeMs).toISOString(),
    });
  }

  return flat;
}

// ═══════════════════════════════════════════════════════════════
// ENTRY POINT
// ═══════════════════════════════════════════════════════════════

(async () => {
  try {
    console.log("🔍 Crawling public directory...");
    const flat = await collectFiles();
    const sections = [...new Set(flat.filter((f) => !f.isIndex).map((f) => f.path.split("/")[0]))].sort();
    const allTags = [...new Set(flat.flatMap((f) => f.tags))].sort();

    console.log(`📄 Found ${flat.length} files`);
    console.log(`📁 ${sections.length} sections`);
    console.log(`🏷️ ${allTags.length} unique tags`);

    // Write outputs
    await fs.writeFile(OUT_JSON, JSON.stringify({
      flat,
      sections,
      tags: allTags,
      generated: new Date().toISOString(),
    }, null, 2));

    await fs.writeFile(OUT_SITEMAP, generateSitemap(flat));
    await fs.writeFile(OUT_ROBOTS, generateRobots());
    await fs.writeFile(OUT_FEED, generateFeed(flat));
    await fs.writeFile(OUT_SCHEMA, generateSchema(flat, sections, allTags));

    console.log(`\n✅ Complete!`);
    console.log(`   • index.json: Full metadata (originalDate, notion_*, authors, source)`);
    console.log(`   • sitemap.xml: Uses originalDate for timestamps`);
    console.log(`   • feed.xml: Sorted by originalDate`);
    console.log(`   • schema.jsonld: Structured data`);
  } catch (e) {
    console.error("❌ Failed:", e);
    process.exit(1);
  }
})();