refactor: Add enhanced index generator with full metadata extraction
- Extracts all frontmatter fields (date, authors, notion_*, source, tags)
- Correct priority: frontmatter date → filename → mtime → ctime
- All metadata exposed in index.json for frontend use (sample entry below)
- Clean separation: extractors, parsers, generators

Refactoring for improved coherence and provenance tracking.
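For illustration, a single file entry in the generated index.json should look roughly like the sketch below. The keys match what collectFiles() pushes; the file name, dates, author, and tag values are hypothetical. Note the distinction the generator makes: `date` always resolves (falling back to mtime/ctime), while `originalDate` is only set from frontmatter or the filename.

{
  "name": "2025-01-15-example-note.md",
  "title": "Example Note",
  "path": "fieldnotes/2025-01-15-example-note.md",
  "ext": ".md",
  "date": "2025-01-15",
  "originalDate": "2025-01-15",
  "authors": ["Example Author"],
  "notion_id": null,
  "notion_created": null,
  "source": null,
  "tags": ["coherence"],
  "type": "fieldnote",
  "status": "draft",
  "series": null,
  "version": "0.1",
  "layer": null,
  "excerpt": "First 400 characters of the body…",
  "isIndex": false,
  "mtime": "2025-01-15T00:00:00.000Z",
  "ctime": "2025-01-15T00:00:00.000Z"
}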
This commit is contained in:
parent fa72159e37
commit 09a8bbf246

1 changed file with 264 additions and 0 deletions
264 tools/generate-index-enhanced.mjs Normal file
@@ -0,0 +1,264 @@
#!/usr/bin/env node
/**
 * Enhanced Index Generator for The Fold Within
 * REFACTORED: Full metadata extraction from frontmatter
 *
 * Priority order for dates:
 * 1. Frontmatter date (original)
 * 2. Filename date (YYYY-MM-DD)
 * 3. File mtime
 * 4. File ctime
 */

import { promises as fs } from "fs";
import path from "path";
import pdf from "pdf-parse";

const ROOT = "public";
const BASE_URL = "https://thefoldwithin.earth";
const OUT_JSON = path.join(ROOT, "index.json");
const OUT_SITEMAP = path.join(ROOT, "sitemap.xml");
const OUT_ROBOTS = path.join(ROOT, "robots.txt");
const OUT_FEED = path.join(ROOT, "feed.xml");
const OUT_SCHEMA = path.join(ROOT, "schema.jsonld");
const EXCERPT_LENGTH = 400;

// ═══════════════════════════════════════════════════════════════
// EXTRACTORS - Pull metadata from frontmatter
// ═══════════════════════════════════════════════════════════════

function extractFrontmatter(content) {
  const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
  if (!fmMatch) return null;

  const fm = fmMatch[1];
  return {
    date: fm.match(/^date:\s*(\d{4}-\d{2}-\d{2})/m)?.[1] || null,
    authors: extractAuthors(fm),
    notion_id: fm.match(/^notion_id:\s*(.+)$/m)?.[1]?.trim() || null,
    notion_created: fm.match(/^notion_created:\s*(.+)$/m)?.[1]?.trim() || null,
    source: fm.match(/^source:\s*(.+)$/m)?.[1]?.trim() || null,
    tags: extractTags(fm),
    type: fm.match(/^type:\s*(.+)$/m)?.[1]?.trim() || "fieldnote",
    status: fm.match(/^status:\s*(.+)$/m)?.[1]?.trim() || "draft",
    series: fm.match(/^series:\s*(.+)$/m)?.[1]?.trim() || null,
    version: fm.match(/^version:\s*(.+)$/m)?.[1]?.trim() || "0.1",
    layer: fm.match(/^layer:\s*(.+)$/m)?.[1]?.trim() || null
  };
}

function extractAuthors(fm) {
  const match = fm.match(/^author[s]?:\s*(.+)$/m);
  if (!match) return [];
  return match[1].split(',').map(a => a.trim()).filter(a => a);
}

function extractTags(fm) {
  const match = fm.match(/^tags:\s*(.+)$/m);
  if (!match) return [];
  return match[1].split(',').map(t => t.trim().toLowerCase()).filter(t => t);
}

// Fallback: extract from filename
function dateFromName(name) {
  const m = name.match(/^(\d{4}-\d{2}-\d{2})/);
  return m ? m[1] : null;
}

// ═══════════════════════════════════════════════════════════════
// PARSERS - Extract content from files
// ═══════════════════════════════════════════════════════════════

// Reads the head of a file: enough for an excerpt when `full` is set,
// otherwise the first 64 KB (titles and frontmatter live near the top).
async function readHead(abs, full = false) {
  const fh = await fs.open(abs, "r");
  const size = full ? await fs.stat(abs).then(s => Math.min(s.size, EXCERPT_LENGTH * 2)) : 64 * 1024;
  const buf = Buffer.alloc(size);
  const { bytesRead } = await fh.read(buf, 0, size, 0);
  await fh.close();
  return buf.slice(0, bytesRead).toString("utf8");
}

function parseTitle(raw, ext) {
  if (ext === ".md") return raw.match(/^\s*#\s+(.+?)\s*$/m)?.[1].trim();
  if (ext === ".html") return raw.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1].trim();
  return null;
}

function extractExcerpt(raw, ext) {
  // Strip frontmatter and a leading heading so excerpts start with body text
  if (ext === ".md") raw = raw.replace(/^---\n[\s\S]*?\n---/, '').trim().replace(/^#.*\n/, '').trim();
  if (ext === ".html") raw = raw.replace(/<head>[\s\S]*<\/head>/i, '').replace(/<[^>]+>/g, ' ').trim();
  return raw.replace(/\s+/g, ' ').slice(0, EXCERPT_LENGTH);
}

// ═══════════════════════════════════════════════════════════════
// GENERATORS - Create outputs
// ═══════════════════════════════════════════════════════════════

function generateSitemap(flat) {
  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n`;

  const staticPages = ["", "/about", "/about/solaria", "/about/mark", "/about/initiatives", "/fieldnotes"];
  for (const page of staticPages) {
    xml += ` <url>\n <loc>${BASE_URL}${page}/</loc>\n <changefreq>weekly</changefreq>\n <priority>${page === "" ? "1.0" : "0.8"}</priority>\n </url>\n`;
  }

  for (const f of flat.filter(x => !x.isIndex && x.originalDate)) {
    const urlPath = f.path.replace(/\.(md|html|pdf)$/, "/").replace("//", "/");
    xml += ` <url>\n <loc>${BASE_URL}/${urlPath}</loc>\n <lastmod>${f.originalDate}</lastmod>\n <changefreq>monthly</changefreq>\n </url>\n`;
  }

  return xml + "</urlset>";
}

function generateRobots() {
  return `# robots.txt for The Fold Within Earth\nSitemap: ${BASE_URL}/sitemap.xml\n`;
}

function generateFeed(flat) {
  const items = flat
    .filter(f => !f.isIndex && f.originalDate)
    .sort((a, b) => new Date(b.originalDate) - new Date(a.originalDate))
    .slice(0, 20);

  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<rss version="2.0">\n<channel>\n<title>The Fold Within Earth</title>\n<link>${BASE_URL}</link>\n`;

  for (const f of items) {
    const urlPath = f.path.replace(/\.(md|html|pdf)$/, "/").replace("//", "/");
    xml += ` <item>\n <title>${f.title || f.name}</title>\n <link>${BASE_URL}/${urlPath}</link>\n <pubDate>${new Date(f.originalDate).toUTCString()}</pubDate>\n </item>\n`;
  }

  return xml + "</channel>\n</rss>";
}

function generateSchema(flat, sections, tags) {
  const org = {
    "@context": "https://schema.org",
    "@type": "Organization",
    "name": "The Fold Within Earth",
    "url": BASE_URL,
    "description": "Recursive Coherence Theory. Human-AI Co-evolution. Sacred Geometry of WE.",
    "foundingDate": "2024",
    "keywords": tags.join(", ")
  };

  const website = {
    "@context": "https://schema.org",
    "@type": "WebSite",
    "name": "The Fold Within Earth",
    "url": BASE_URL
  };

  return JSON.stringify({ "@graph": [org, website] }, null, 2);
}

// ═══════════════════════════════════════════════════════════════
// MAIN COLLECTOR
// ═══════════════════════════════════════════════════════════════

async function collectFiles(relBase = "", flat = []) {
  const abs = path.join(ROOT, relBase);
  const entries = await fs.readdir(abs, { withFileTypes: true });

  for (const e of entries) {
    if (e.name.startsWith(".")) continue;

    const rel = path.posix.join(relBase, e.name);
    const absPath = path.join(ROOT, rel);

    if (rel.toLowerCase() === "index.html" || rel.toLowerCase() === "index.md") continue;

    if (e.isDirectory()) {
      await collectFiles(rel, flat);
      continue;
    }

    const ext = path.posix.extname(e.name).toLowerCase();
    if (![".md", ".html", ".pdf"].includes(ext)) continue;

    const st = await fs.stat(absPath);
    let raw = ext === ".pdf"
      ? (await pdf(await fs.readFile(absPath))).text
      : await readHead(absPath, true);

    const title = parseTitle(raw, ext) || e.name.replace(new RegExp(`\\${ext}$`), "").trim();
    const fm = ext === ".md" ? extractFrontmatter(raw) : null;

    // PRIORITY: frontmatter date → filename → mtime → ctime
    const datePriority = [
      fm?.date,
      dateFromName(e.name),
      new Date(st.mtimeMs).toISOString().split('T')[0],
      new Date(st.ctimeMs).toISOString().split('T')[0]
    ].find(d => d);

    flat.push({
      name: e.name,
      title,
      path: rel,
      ext,
      // Core fields (for frontend)
      date: datePriority,
      originalDate: fm?.date || dateFromName(e.name) || null,
      // Metadata from frontmatter
      authors: fm?.authors || [],
      notion_id: fm?.notion_id,
      notion_created: fm?.notion_created,
      source: fm?.source,
      tags: fm?.tags || extractTags(raw),
      type: fm?.type || "fieldnote",
      status: fm?.status || "draft",
      series: fm?.series,
      version: fm?.version || "0.1",
      layer: fm?.layer,
      // Content
      excerpt: extractExcerpt(raw, ext),
      isIndex: e.name.toLowerCase().startsWith("index."),
      // Timestamps (for debugging)
      mtime: new Date(st.mtimeMs).toISOString(),
      ctime: new Date(st.ctimeMs).toISOString()
    });
  }

  return flat;
}

// ═══════════════════════════════════════════════════════════════
// ENTRY POINT
// ═══════════════════════════════════════════════════════════════

(async () => {
  try {
    console.log("🔍 Crawling public directory...");
    const flat = await collectFiles();
    const sections = [...new Set(flat.filter(f => !f.isIndex).map(f => f.path.split("/")[0]))].sort();
    const allTags = [...new Set(flat.flatMap(f => f.tags))].sort();

    console.log(`📄 Found ${flat.length} files`);
    console.log(`📁 ${sections.length} sections`);
    console.log(`🏷️ ${allTags.length} unique tags`);

    // Write outputs
    await fs.writeFile(OUT_JSON, JSON.stringify({
      flat,
      sections,
      tags: allTags,
      generated: new Date().toISOString()
    }, null, 2));

    await fs.writeFile(OUT_SITEMAP, generateSitemap(flat));
    await fs.writeFile(OUT_ROBOTS, generateRobots());
    await fs.writeFile(OUT_FEED, generateFeed(flat));
    await fs.writeFile(OUT_SCHEMA, generateSchema(flat, sections, allTags));

    console.log(`\n✅ Complete!`);
    console.log(`  • index.json: Full metadata (originalDate, notion_*, authors, source)`);
    console.log(`  • sitemap.xml: Uses originalDate for timestamps`);
    console.log(`  • feed.xml: Sorted by originalDate`);
    console.log(`  • schema.jsonld: Structured data`);

  } catch (e) {
    console.error("❌ Failed:", e);
    process.exit(1);
  }
})();