Business Reviews
v1 · Published · Extract customer reviews from any business or product review page — reviewer, star rating, date, and full text. Parameter: url.
Output & API
Preview the latest data, download it, or call this collector as an API.
| reviews | |
|---|---|
| pageTitle | Amazon.com Reviews from Real Customers |
| sourceUrl | https://www.consumeraffairs.com/online/amazon.html |
| reviewCount | 10 |
Parameters
--url (string, required): The URL of the business or product review page to extract reviews from, e.g. "https://www.consumeraffairs.com/online/amazon.html"
Marketplace
Publish this collector so others can deploy it — you keep ownership.
0 runs in 14d · published 5h ago
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
v1 · built · approved · current · 5h ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import { parseArgs } from "node:util";
/**
* Extract individual customer reviews from a business/product review page.
*
* Given any review page URL, this scrapes the page and uses Firecrawl's
* schema-guided extraction to pull out each review's reviewer name, star
* rating, review date, and full review text. LLM-backed extraction is used
* deliberately: review markup differs wildly across sites (Trustpilot, G2,
* ConsumerAffairs, Amazon, Shopify stores, etc.), and per-review ratings are
* frequently encoded as images/aria-labels that no single CSS selector can
* read generically.
*
* Output (stdout, JSON only):
* {
* sourceUrl: string,
* pageTitle: string | null,
* reviewCount: number,
* reviews: Array<{
* reviewerName: string | null,
* starRating: number | null, // original scale (typically 1-5)
* reviewDate: string | null, // as displayed on the page
* reviewText: string | null
* }>
* }
*/
/**
 * Shape of a single review entry as it arrives in the scrape response's
 * `json.reviews` array, before normalization. Every field is optional and
 * typed `unknown`, so each value has to be narrowed before it is used.
 */
interface RawReview {
reviewerName?: unknown;
starRating?: unknown;
reviewDate?: unknown;
reviewText?: unknown;
}
/**
 * Normalized review as emitted in the `reviews` array of the JSON output.
 * A field is `null` when no usable value was extracted for it.
 */
interface Review {
// Display name of the reviewer.
reviewerName: string | null;
// Rating on its original scale (typically 1-5).
starRating: number | null;
// Review date as displayed on the page.
reviewDate: string | null;
// Body text of the review.
reviewText: string | null;
}
// JSON Schema for the structured payload requested from the extraction call:
// an object with a required `reviews` array whose items each declare the four
// review fields. Every item field is listed as required but may be null, so a
// missing value is reported explicitly instead of being omitted.
//
// Plain JSON Schema (not a Zod schema): the Firecrawl SDK's automatic Zod->JSON
// Schema conversion is unreliable for nested arrays, so we hand it a literal
// JSON Schema, which the extraction endpoint accepts directly.
//
// The `description` strings are part of the schema sent with the request;
// changing them changes the request payload.
const REVIEW_SCHEMA = {
type: "object",
properties: {
reviews: {
type: "array",
items: {
type: "object",
properties: {
reviewerName: {
type: ["string", "null"],
description: "Display name of the reviewer/customer who wrote the review",
},
starRating: {
type: ["number", "null"],
description:
"The star/numeric rating the reviewer gave, on its original scale (e.g. 1-5). Null if the review shows no rating.",
},
reviewDate: {
type: ["string", "null"],
description: "The date the review was written/posted, exactly as shown on the page",
},
reviewText: {
type: ["string", "null"],
description: "The complete body text of the review",
},
},
required: ["reviewerName", "starRating", "reviewDate", "reviewText"],
},
},
},
required: ["reviews"],
} as const;
// Natural-language instruction sent alongside the JSON schema. It is kept as
// a list of sentence fragments (each carrying its own trailing space) and
// joined without a separator so the final prompt reads as one paragraph.
const EXTRACTION_PROMPT = [
  "Extract every individual customer review listed on this page. For each review capture: ",
  "the reviewer's display name, the star rating they gave (as a number on its original scale, e.g. 1-5), ",
  "the review date exactly as displayed, and the full, complete review text. ",
  "Only include genuine customer reviews — ignore navigation, ads, related products, and editorial copy.",
].join("");
/**
 * Coerces an untrusted value into a trimmed, non-empty string.
 *
 * @param v - Any value taken from the extraction payload.
 * @returns The trimmed string, or `null` when `v` is not a string or is
 *   empty/whitespace-only after trimming.
 */
function toStringOrNull(v: unknown): string | null {
  if (typeof v !== "string") {
    return null;
  }
  const trimmed = v.trim();
  return trimmed === "" ? null : trimmed;
}
/**
 * Coerces an untrusted value into a finite number.
 *
 * Finite numbers pass through unchanged. For strings, the first numeric token
 * (optional leading minus, digits, optional decimal part) is parsed, so
 * "4.5 out of 5" yields 4.5.
 *
 * @param v - Any value taken from the extraction payload.
 * @returns The number, or `null` when no finite number can be derived.
 */
function toNumberOrNull(v: unknown): number | null {
  if (typeof v === "number") {
    return Number.isFinite(v) ? v : null;
  }
  if (typeof v !== "string") {
    return null;
  }
  const hit = /-?\d+(\.\d+)?/.exec(v);
  if (hit === null) {
    return null;
  }
  const value = Number(hit[0]);
  return Number.isFinite(value) ? value : null;
}
/**
 * CLI entry point: validates `--url`, runs a schema-guided Firecrawl scrape of
 * that page, normalizes the extracted reviews, and writes a single JSON
 * document to stdout.
 *
 * Failure modes:
 * - missing/blank `--url` or unset FIRECRAWL_API_KEY: message on stderr, exit code 1;
 * - unparseable or non-http(s) URL: throws an `OUT_OF_SCOPE:` error;
 * - scrape response without a `json` payload: throws.
 */
async function main(): Promise<void> {
  type ScrapeResult = {
    json?: { reviews?: RawReview[] };
    metadata?: { title?: string };
  };

  const cli = parseArgs({
    options: {
      url: { type: "string" },
    },
    strict: true,
  });

  const rawUrl = cli.values.url;
  if (typeof rawUrl !== "string" || rawUrl.trim() === "") {
    console.error("Missing required parameter: --url=<review page URL>");
    process.exit(1);
  }

  // Anything that is not a well-formed http(s) URL is rejected up front.
  let parsed: URL;
  try {
    parsed = new URL(rawUrl.trim());
  } catch {
    throw new Error(`OUT_OF_SCOPE: not a valid URL: ${rawUrl}`);
  }
  if (!["http:", "https:"].includes(parsed.protocol)) {
    throw new Error(`OUT_OF_SCOPE: URL must be http(s): ${rawUrl}`);
  }
  const pageUrl = parsed.toString();

  const firecrawlKey = process.env.FIRECRAWL_API_KEY;
  if (!firecrawlKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const firecrawl = new Firecrawl({ apiKey: firecrawlKey });
  const scrapeOptions = {
    formats: [{ type: "json", schema: REVIEW_SCHEMA as unknown as Record<string, unknown>, prompt: EXTRACTION_PROMPT }],
    proxy: "auto",
    integration: "prometheus",
  } as Parameters<typeof firecrawl.scrape>[1];
  const res = (await firecrawl.scrape(pageUrl, scrapeOptions)) as ScrapeResult;

  if (!res || res.json == null) {
    throw new Error(
      `review extraction returned no structured data for ${pageUrl} (page may be bot-blocked, empty, or not a review page)`,
    );
  }

  // A payload without a proper `reviews` array is treated as zero reviews.
  const extracted = Array.isArray(res.json.reviews) ? res.json.reviews : [];
  const reviews: Review[] = [];
  for (const entry of extracted) {
    const review: Review = {
      reviewerName: toStringOrNull(entry?.reviewerName),
      starRating: toNumberOrNull(entry?.starRating),
      reviewDate: toStringOrNull(entry?.reviewDate),
      reviewText: toStringOrNull(entry?.reviewText),
    };
    // Drop entries that carry neither review text nor a reviewer name.
    if (review.reviewText === null && review.reviewerName === null) {
      continue;
    }
    reviews.push(review);
  }

  const payload = {
    sourceUrl: pageUrl,
    pageTitle: toStringOrNull(res.metadata?.title),
    reviewCount: reviews.length,
    reviews,
  };
  process.stdout.write(JSON.stringify(payload));
}
// Script runner: any rejection from main() is reported on stderr as a plain
// message (no stack trace) and the process exits with status 1.
main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.