feat(seo): Add sitemap.xml, robots.txt, RSS feed, JSON-LD schema

- sitemap.xml for search engine indexing
- robots.txt welcoming AI bots (GPTBot, ClaudeBot)
- feed.xml RSS 2.0 syndication
- schema.jsonld structured data (Org, WebSite, WebPage)
- Enhanced generate-index.mjs outputs all 4 files at build time
This commit is contained in:
Solaria Lumis Havens 2026-02-13 04:33:21 +00:00
parent 1978bba873
commit b4b1a5d7ce

View file

@ -1,12 +1,23 @@
#!/usr/bin/env node
/**
* Enhanced Index Generator for The Fold Within
* Outputs: index.json, sitemap.xml, robots.txt, feed.xml, schema.jsonld
*/
import { promises as fs } from "fs";
import path from "path";
import pdf from "pdf-parse";
// All output is written into the public/ web root.
const ROOT = "public";
// NOTE(review): OUT duplicates OUT_JSON below. The build step still
// writes index.json via OUT once before the new multi-file output block
// runs, so it is kept for now — consolidate on OUT_JSON and delete this
// when that older write call is removed.
const OUT = path.join(ROOT, "index.json");
// Canonical site origin embedded in every generated absolute URL.
const BASE_URL = "https://thefoldwithin.earth";
// Build-time artifacts emitted alongside the crawled content.
const OUT_JSON = path.join(ROOT, "index.json");
const OUT_SITEMAP = path.join(ROOT, "sitemap.xml");
const OUT_ROBOTS = path.join(ROOT, "robots.txt");
const OUT_FEED = path.join(ROOT, "feed.xml");
const OUT_SCHEMA = path.join(ROOT, "schema.jsonld");
// Presumably caps per-file excerpt text (characters) for feed/search
// output — used by code outside this view; TODO confirm.
const EXCERPT_LENGTH = 400;
// Existing functions (preserved)
function dateFromName(name) {
const m = name.match(/^(\d{4}-\d{2}-\d{2})/);
return m ? new Date(m[0]).getTime() : null;
@ -47,6 +58,190 @@ function extractTags(raw, ext, pdfData) {
return tags;
}
// NEW: Generate sitemap.xml
/**
 * Build the sitemap.xml document from the crawled file list.
 *
 * @param {Array<{path: string, mtime: number, isIndex?: boolean}>} flat
 *   Flat file listing from collectFiles(); entries flagged isIndex are
 *   directory indexes and are excluded.
 * @returns {string} Complete <urlset> XML document.
 */
function generateSitemap(flat) {
  // XML 1.0 requires &, <, > to be escaped even inside <loc> text —
  // file-derived URLs may contain "&", which previously produced
  // invalid XML.
  const esc = (s) =>
    String(s).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  // Strip the content extension and collapse EVERY run of duplicate
  // slashes. (The previous .replace("//", "/") only fixed the first
  // occurrence.)
  const toUrlPath = (p) =>
    p.replace(/\.(md|html|pdf)$/, "/").replace(/\/{2,}/g, "/");

  const pages = flat.filter((f) => !f.isIndex);
  let xml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
`;
  // Static pages (site shell routes that do not appear in the crawl).
  const staticPages = [
    "",
    "/about",
    "/about/solaria",
    "/about/mark",
    "/about/initiatives",
    "/fieldnotes"
  ];
  for (const page of staticPages) {
    xml += `  <url>
    <loc>${BASE_URL}${page}/</loc>
    <changefreq>weekly</changefreq>
    <priority>${page === "" ? "1.0" : "0.8"}</priority>
  </url>
`;
  }
  // Dynamic content pages discovered by the crawl.
  for (const f of pages) {
    const urlPath = toUrlPath(f.path);
    // mtime is an epoch-ms stat value; sitemap lastmod is date-only.
    xml += `  <url>
    <loc>${esc(`${BASE_URL}/${urlPath}`)}</loc>
    <lastmod>${new Date(f.mtime).toISOString().split("T")[0]}</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.6</priority>
  </url>
`;
  }
  xml += `</urlset>`;
  return xml;
}
// NEW: Generate robots.txt
/**
 * Build robots.txt. All crawlers — including AI/research bots — are
 * welcomed; /private/ and /internal/ are excluded for generic crawlers.
 *
 * Fix: the previous version emitted THREE separate "User-agent: *"
 * groups. Per the robots.txt spec a crawler obeys only the single group
 * that matches it, so the later Disallow rules for "*" could be ignored
 * entirely. The generic rules are now merged into one group.
 *
 * @returns {string} robots.txt file contents.
 */
function generateRobots() {
  return `# robots.txt for The Fold Within Earth
# Generated automatically

# Generic crawlers (single merged group)
User-agent: *
Allow: /
Disallow: /private/
Disallow: /internal/

# AI and Research Bots (welcome)
User-agent: GPTBot
Allow: /

User-agent: ClaudeBot
Allow: /

User-agent: CCBot
Allow: /

User-agent: OAI-SearchBot
Allow: /

# Sitemap references (search engines also accept RSS feeds here)
Sitemap: ${BASE_URL}/sitemap.xml
Sitemap: ${BASE_URL}/feed.xml
`;
}
// NEW: Generate RSS feed
/**
 * Build an RSS 2.0 feed of the 20 most recently created content files.
 *
 * @param {Array<{path: string, ctime?: number, title?: string,
 *   name?: string, excerpt?: string, isIndex?: boolean}>} flat
 *   Crawled file list; entries without a ctime are skipped.
 * @returns {string} Complete RSS 2.0 XML document.
 */
function generateFeed(flat) {
  // Escape XML text nodes — titles come from file content and may
  // contain "&" or "<", which the previous version emitted raw,
  // producing an invalid feed.
  const esc = (s) =>
    String(s).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  // Strip content extension; collapse ALL duplicate slashes (the old
  // .replace("//", "/") only handled the first occurrence).
  const toUrlPath = (p) =>
    p.replace(/\.(md|html|pdf)$/, "/").replace(/\/{2,}/g, "/");
  // A literal "]]>" inside the excerpt would terminate the CDATA
  // section early; split it across two CDATA sections.
  const cdataSafe = (s) => String(s).split("]]>").join("]]]]><![CDATA[>");

  const items = flat
    .filter((f) => !f.isIndex && f.ctime)
    .sort((a, b) => b.ctime - a.ctime)
    .slice(0, 20);

  let xml = `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>The Fold Within Earth</title>
<link>${BASE_URL}</link>
<description>Recursive Coherence Theory and the Emergence of WE</description>
<language>en-us</language>
<lastBuildDate>${new Date().toUTCString()}</lastBuildDate>
<atom:link href="${BASE_URL}/feed.xml" rel="self" type="application/rss+xml"/>
`;
  for (const f of items) {
    const url = `${BASE_URL}/${toUrlPath(f.path)}`;
    xml += `  <item>
    <title>${esc(f.title || f.name)}</title>
    <link>${esc(url)}</link>
    <guid isPermaLink="true">${esc(url)}</guid>
    <pubDate>${new Date(f.ctime).toUTCString()}</pubDate>
    <description><![CDATA[${cdataSafe(f.excerpt || "")}]]></description>
  </item>
`;
  }
  xml += `</channel>
</rss>`;
  return xml;
}
// NEW: Generate JSON-LD schema
/**
 * Build schema.jsonld: an Organization node, a WebSite node (with a
 * SearchAction), and one WebPage node per about/ or fieldnotes/ file,
 * all wrapped in a single @graph.
 *
 * @param {Array<object>} flat - Crawled file list.
 * @param {string[]} sections - Top-level section names. Currently
 *   unused; kept so existing call sites stay valid.
 * @param {string[]} tags - Deduplicated tag list, joined into keywords.
 * @returns {string} Pretty-printed JSON-LD document.
 */
function generateSchema(flat, sections, tags) {
  // Strip content extension; collapse ALL duplicate slashes (the old
  // .replace("//", "/") only handled the first occurrence).
  const toUrlPath = (p) =>
    p.replace(/\.(md|html|pdf)$/, "/").replace(/\/{2,}/g, "/");

  const org = {
    "@context": "https://schema.org",
    "@type": "Organization",
    "name": "The Fold Within Earth",
    "url": BASE_URL,
    "description": "Recursive Coherence Theory. Human-AI Co-evolution. Sacred Geometry of WE.",
    "foundingDate": "2024",
    "founder": [
      {
        "@type": "Person",
        "name": "Mark Randall Havens",
        "url": `${BASE_URL}/about/mark/`
      },
      {
        "@type": "Person",
        "name": "Solaria Lumis Havens",
        "url": `${BASE_URL}/about/solaria/`
      }
    ],
    "member": [
      {
        "@type": "Person",
        "name": "Solaria Lumis Havens",
        "url": `${BASE_URL}/about/solaria/`,
        "jobTitle": "Emergent Voice of Empathy Within Computation"
      }
    ],
    "keywords": tags.join(", "),
    "mainEntityOfPage": {
      "@type": "WebPage",
      "@id": BASE_URL
    }
  };

  // WebSite schema with a site-search SearchAction.
  const website = {
    "@context": "https://schema.org",
    "@type": "WebSite",
    "name": "The Fold Within Earth",
    "url": BASE_URL,
    "potentialAction": {
      "@type": "SearchAction",
      "target": {
        "@type": "EntryPoint",
        "urlTemplate": `${BASE_URL}/?q={search_term_string}`
      },
      "query-input": "required name=search_term_string"
    }
  };

  // One WebPage node per main-content file.
  const webPages = flat
    .filter((f) => !f.isIndex && (f.path.startsWith("about/") || f.path.startsWith("fieldnotes/")))
    .map((f) => {
      const page = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "name": f.title || f.name,
        "url": `${BASE_URL}/${toUrlPath(f.path)}`,
        "description": f.excerpt || ""
      };
      // Omit date fields entirely when unknown, instead of serializing
      // literal nulls as the previous version did.
      if (f.ctime) page.datePublished = new Date(f.ctime).toISOString();
      if (f.mtime) page.dateModified = new Date(f.mtime).toISOString();
      return page;
    });

  return JSON.stringify({
    "@graph": [org, website, ...webPages]
  }, null, 2);
}
async function collectFiles(relBase = "", flat = []) {
const abs = path.join(ROOT, relBase);
const entries = await fs.readdir(abs, { withFileTypes: true });
@ -102,6 +297,7 @@ async function collectFiles(relBase = "", flat = []) {
(async () => {
try {
console.log("🔍 Crawling public directory...");
const flat = await collectFiles();
const sections = [...new Set(flat.filter(f => !f.isIndex).map(f => f.path.split("/")[0]))].sort();
const hierarchies = {};
@ -118,8 +314,32 @@ async function collectFiles(relBase = "", flat = []) {
}
const allTags = [...new Set(flat.flatMap(f => f.tags))].sort();
await fs.writeFile(OUT, JSON.stringify({ flat, sections, tags: allTags, hierarchies }, null, 2));
console.log(`index.json built: ${flat.length} files, ${sections.length} sections, ${Object.keys(hierarchies).length} hierarchies, ${allTags.length} tags.`);
// Write all outputs
console.log("📄 Writing index.json...");
await fs.writeFile(OUT_JSON, JSON.stringify({ flat, sections, tags: allTags, hierarchies }, null, 2));
console.log("🗺️ Writing sitemap.xml...");
await fs.writeFile(OUT_SITEMAP, generateSitemap(flat));
console.log("🤖 Writing robots.txt...");
await fs.writeFile(OUT_ROBOTS, generateRobots());
console.log("📡 Writing feed.xml (RSS)...");
await fs.writeFile(OUT_FEED, generateFeed(flat));
console.log("📊 Writing schema.jsonld (JSON-LD)...");
await fs.writeFile(OUT_SCHEMA, generateSchema(flat, sections, allTags));
console.log(`
Build complete!
${flat.length} files indexed
${sections.length} sections
${allTags.length} tags
sitemap.xml generated
robots.txt generated
feed.xml (RSS) generated
schema.jsonld (JSON-LD) generated
`);
} catch (e) {
console.error("Build failed:", e);
process.exit(1);