Website Content Crawler

Name: Website Content Crawler Data Collector | Firecrawl Prometheus
Creator: sideguide
Published: 2026-06-13T21:57:53.768Z
License: https://opensource.org/licenses/MIT

v1 · Published

Crawl an entire website into clean markdown — one entry per page with title, URL, and content. Ideal for AI/RAG ingestion. Parameters: url, limit.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

pages
startUrl	https://firecrawl.dev/
crawledAt	2026-06-13T21:57:42.030Z
pageCount	4

Parameters

--url (string, required): The starting website URL to crawl (must be http or https). e.g. "https://firecrawl.dev"

--limit (number): Maximum number of pages to crawl. default 20 · e.g. 4

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers

sideguide · @sideguide

0 runs in 14d · published 6w ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 6w ago

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";

/**
 * Site-to-markdown crawler.
 *
 * Given a starting website URL, this crawls the site (following links within
 * it) and returns clean markdown content for each page it visits, along with
 * the page title and URL.
 *
 * Strategy: a single Firecrawl crawl job starting at the supplied URL, with
 * each discovered page scraped to the "markdown" format. Firecrawl handles
 * link discovery and rendering; we read each returned document's markdown body
 * and its metadata (title + resolved source URL) and emit one record per page.
 *
 * Output shape:
 *   {
 *     startUrl,            // the URL the crawl started from
 *     crawledAt,           // ISO timestamp of this run
 *     pageCount,           // number of pages returned
 *     pages: [             // one entry per crawled page
 *       { url, title, markdown }
 *     ]
 *   }
 */

/** One crawled page as emitted in the `pages` array of the output. */
interface Page {
  /** Page URL taken from the crawl metadata, or the start URL when none is reported. */
  url: string;
  /** Whitespace-normalized page title, or `null` when the metadata carries no usable title. */
  title: string | null;
  /** Markdown body of the page; an empty string when the document has no markdown. */
  markdown: string;
}

/** Top-level JSON document written to stdout at the end of a run. */
interface Output {
  /** The normalized URL the crawl started from. */
  startUrl: string;
  /** ISO 8601 timestamp taken when the output was assembled. */
  crawledAt: string;
  /** Number of entries in `pages`. */
  pageCount: number;
  /** One entry per page kept from the crawl result. */
  pages: Page[];
}

/**
 * Normalize an arbitrary value into single-line text.
 *
 * Every run of whitespace is collapsed to one space and the result is trimmed.
 *
 * @param v - Value of unknown type, typically a metadata field.
 * @returns The normalized text, or `null` when `v` is not a string or contains
 *   nothing but whitespace.
 */
function cleanText(v: unknown): string | null {
  if (typeof v === "string") {
    const collapsed = v.replace(/\s+/g, " ").trim();
    if (collapsed !== "") {
      return collapsed;
    }
  }
  return null;
}

/** Loose view of the fields this script reads from one crawled document. */
type CrawledDoc = { markdown?: string; metadata?: Record<string, unknown> };

/**
 * Parse the optional `--limit` flag.
 *
 * Prints an error and exits the process with code 1 when the value is not a
 * positive integer.
 *
 * @param raw - Raw flag value, or `undefined` when the flag was not supplied.
 * @returns The page cap to use; 20 when the flag is absent.
 */
function parsePageLimit(raw: string | undefined): number {
  if (raw === undefined) return 20;
  const parsed = Number(raw);
  // Number.isInteger is already false for NaN and +/-Infinity.
  if (!Number.isInteger(parsed) || parsed < 1) {
    console.error("--limit must be a positive integer");
    process.exit(1);
  }
  return parsed;
}

/**
 * Convert crawled documents into output pages.
 *
 * Documents are de-duplicated by the URL reported in their metadata. A
 * document without any reported URL is always kept and labelled with
 * `fallbackUrl`; it is never treated as a duplicate, because the fallback is
 * only a placeholder and says nothing about which page the content came from.
 *
 * @param docs - Documents returned by the crawl.
 * @param fallbackUrl - URL used for documents whose metadata has no URL.
 * @returns One page per distinct reported URL, plus every URL-less document,
 *   in the order the documents were returned.
 */
function collectPages(docs: readonly CrawledDoc[], fallbackUrl: string): Page[] {
  const pages: Page[] = [];
  const seen = new Set<string>();
  for (const doc of docs) {
    const metadata = doc.metadata ?? {};
    const reportedUrl =
      cleanText(metadata.url) ?? cleanText(metadata.sourceURL);
    if (reportedUrl !== null) {
      if (seen.has(reportedUrl)) continue;
      seen.add(reportedUrl);
    }
    pages.push({
      url: reportedUrl ?? fallbackUrl,
      title: cleanText(metadata.title) ?? cleanText(metadata.ogTitle),
      markdown: typeof doc.markdown === "string" ? doc.markdown : "",
    });
  }
  return pages;
}

/**
 * Entry point: crawl a site and write the result as JSON to stdout.
 *
 * Reads `--url` (required) and `--limit` (optional) from the command line and
 * the API key from the `FIRECRAWL_API_KEY` environment variable. Progress and
 * diagnostics go to stderr so that stdout carries only the JSON document.
 *
 * Exits the process with code 1 when `--url` is missing, `--limit` is not a
 * positive integer, or the API key is not set.
 *
 * @throws Error with an `OUT_OF_SCOPE:` prefix when `--url` is not a valid
 *   http(s) URL.
 * @throws Error when the crawl returns no pages.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    strict: true,
    options: {
      url: { type: "string" },
      limit: { type: "string" },
    },
  });

  const rawUrl = (values.url ?? "").trim();
  if (rawUrl.length === 0) {
    console.error("Missing required parameter: --url=<starting website URL>");
    process.exit(1);
  }

  let target: URL;
  try {
    target = new URL(rawUrl);
  } catch {
    throw new Error(`OUT_OF_SCOPE: not a valid URL: ${rawUrl}`);
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    throw new Error(`OUT_OF_SCOPE: URL must use http or https: ${rawUrl}`);
  }
  const startUrl = target.toString();

  // Optional page cap; defaults to 20.
  const limit = parsePageLimit(values.limit);

  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const firecrawl = new Firecrawl({ apiKey });

  console.error(`Crawling ${startUrl} (limit=${limit})`);
  const res = (await firecrawl.crawl(startUrl, {
    limit,
    scrapeOptions: { formats: ["markdown"], onlyMainContent: true },
    integration: "prometheus",
  })) as {
    status?: string;
    data?: CrawledDoc[];
  };

  const docs = res.data ?? [];
  if (docs.length === 0) {
    throw new Error(
      `crawl returned no pages for ${startUrl} (site may be empty, bot-blocked, or unreachable)`,
    );
  }

  // Surface partial results instead of presenting them as a complete crawl.
  // NOTE(review): assumes the SDK reports "completed" for a finished job -
  // confirm against the SDK version in use.
  if (res.status !== undefined && res.status !== "completed") {
    console.error(
      `Warning: crawl ended with status "${res.status}"; results may be incomplete`,
    );
  }

  const pages = collectPages(docs, startUrl);

  const out: Output = {
    startUrl,
    crawledAt: new Date().toISOString(),
    pageCount: pages.length,
    pages,
  };

  console.error(`Done: ${pages.length} pages`);
  process.stdout.write(JSON.stringify(out));
}

// Run the script; on any failure print the reason to stderr and exit non-zero.
main().catch((reason: unknown) => {
  const message = reason instanceof Error ? reason.message : String(reason);
  console.error(message);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.