Local Business Search
v1 · Published
Find local businesses for a search query — name, address, phone, website, category, rating, and review count. Parameter: query.
Output & API
Preview the latest data, download it, or call this collector as an API.
| Field | Value |
|---|---|
| count | 30 |
| query | coffee shops in Austin TX |
| location | Austin TX |
| sourceUrl | https://www.yellowpages.com/search?search_terms=coffee+shops&geo_location_terms=Austin+TX |
| businesses | |
| searchTerms | coffee shops |
Marketplace
Publish this collector so others can deploy it — you keep ownership.
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
/**
* Local-business search → structured listings.
*
* Given a natural-language local-business query (e.g. "coffee shops in
* Austin TX"), this scrapes the matching Yellow Pages search-results page and
* extracts, for each business: name, full address, phone, website, primary
* category, star rating, and number of reviews.
*
* Yellow Pages is used as the source because a single search-results page
* already carries every requested field in clean, server-rendered HTML, so the
* data can be picked out deterministically with CSS selectors (no LLM needed).
*
* The query is split into a "what" and a "where" on the last " in " token:
* "coffee shops in Austin TX" -> terms="coffee shops", location="Austin TX".
* If there is no " in " token, the whole query is used as the search terms with
* no location filter.
*
* Output (stdout, JSON only):
* {
* query: string,
* searchTerms: string,
* location: string | null,
* sourceUrl: string,
* count: number,
* businesses: Array<{
* name: string,
* address: string | null,
* phone: string | null,
* website: string | null,
* category: string | null, // primary (first) category
* rating: number | null, // stars, 0-5 scale
* reviewCount: number | null
* }>
* }
*/
/**
 * One business listing extracted from a search-results page.
 * Every field except `name` is null when the listing does not provide it.
 */
interface Business {
/** Listing name; result rows without a name are skipped by the extractor. */
name: string;
/** Street address and locality joined with ", " (either part may be absent). */
address: string | null;
/** Text of the listing's primary phone element. */
phone: string | null;
/** href of the listing's "visit website" link. */
website: string | null;
/** Primary category, i.e. the first category link on the listing. */
category: string | null;
/** Star rating decoded from the rating element's class words (whole or half stars). */
rating: number | null;
/** Number of reviews parsed from the count shown next to the rating. */
reviewCount: number | null;
}
/**
 * Maps the English number words used in rating class names to their star
 * value. Each word's position in the list is the number of stars it stands for.
 */
const STAR_WORDS: Record<string, number> = Object.fromEntries(
  ["zero", "one", "two", "three", "four", "five"].map(
    (word, stars): [string, number] => [word, stars],
  ),
);
/**
 * Normalize scraped text: collapse every run of whitespace to a single space
 * and trim both ends.
 *
 * @returns the normalized text, or null when the input is not a string or
 *          nothing but whitespace remains.
 */
function cleanText(v: string | undefined | null): string | null {
  if (typeof v === "string") {
    const collapsed = v.replace(/\s+/g, " ").trim();
    if (collapsed.length > 0) {
      return collapsed;
    }
  }
  return null;
}
// Yellow Pages encodes the star rating as class words on .result-rating,
// e.g. "result-rating three half" -> 3.5, "result-rating five" -> 5.
//
// Returns the rating in stars (a whole number, plus 0.5 when the "half" word
// is present), or null when the attribute is missing/empty or contains no
// star word. If several star words appear, the last one wins.
function parseRating(classAttr: string | undefined): number | null {
  if (!classAttr) return null;
  let base: number | null = null;
  let half = false;
  for (const w of classAttr.toLowerCase().split(/\s+/)) {
    if (w === "half") {
      half = true;
      continue;
    }
    // Own-property lookup only: the `in` operator also matches keys inherited
    // from Object.prototype (e.g. "constructor", "__proto__"), which would
    // store a non-number in `base` and leak it out as the rating.
    if (Object.prototype.hasOwnProperty.call(STAR_WORDS, w)) {
      const stars = STAR_WORDS[w];
      if (typeof stars === "number") base = stars;
    }
  }
  if (base === null) return null;
  return half ? base + 0.5 : base;
}
// "(12)" -> 12 ; "" / no digits -> null
/**
 * Pull the first run of digits (thousands separators allowed) out of a
 * review-count label and return it as a number.
 *
 * @returns the parsed count, or null when the label is null/empty or holds
 *          no digits.
 */
function parseReviewCount(v: string | null): number | null {
  const digits = v ? /\d[\d,]*/.exec(v) : null;
  if (digits === null) {
    return null;
  }
  // Drop the commas before converting, so "1,234" parses as 1234.
  const count = Number(digits[0].split(",").join(""));
  return Number.isFinite(count) ? count : null;
}
/**
 * Split a free-text query into search terms and an optional location.
 *
 * The separator is the LAST whitespace-delimited "in" token, e.g.
 * "coffee shops in Austin TX" -> { terms: "coffee shops", location: "Austin TX" }.
 * Matching is case-insensitive, so an uppercase "IN" token also counts as the
 * separator. When no such token has text on both sides, the whole trimmed
 * query becomes the terms and location is null.
 */
function splitQuery(query: string): { terms: string; location: string | null } {
  const trimmed = query.trim();
  // The first group is greedy, so the split lands on the final " in ".
  const parts = /^(.*\S)\s+in\s+(\S.*)$/i.exec(trimmed);
  if (parts === null) {
    return { terms: trimmed, location: null };
  }
  const [, what, where] = parts;
  return { terms: what.trim(), location: where.trim() };
}
/**
 * CLI entry point: read --query, scrape the matching Yellow Pages
 * search-results page through Firecrawl, and write the extracted listings to
 * stdout as a single JSON document.
 *
 * Exits with status 1 (message on stderr) when --query is missing or blank,
 * or when FIRECRAWL_API_KEY is not set. Throws when the scrape returns no
 * HTML, or when result blocks exist but none of them carries a business name.
 * A page with no result blocks at all is reported as a valid, empty result.
 */
async function main(): Promise<void> {
  const cli = parseArgs({
    options: {
      query: { type: "string" },
    },
    strict: true,
  });
  const query = cli.values.query;
  if (!query || query.trim().length === 0) {
    console.error("Missing required parameter: --query=<local-business search>");
    process.exit(1);
  }

  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    console.error("FIRECRAWL_API_KEY environment variable is not set");
    process.exit(1);
  }

  const { terms, location } = splitQuery(query);

  // Percent-encode each value, then swap "%20" for "+" so the URL stays readable.
  const encodeParam = (value: string): string =>
    encodeURIComponent(value).replace(/%20/g, "+");
  let queryString = `search_terms=${encodeParam(terms)}`;
  if (location) {
    queryString += `&geo_location_terms=${encodeParam(location)}`;
  }
  const sourceUrl = `https://www.yellowpages.com/search?${queryString}`;

  const firecrawl = new Firecrawl({ apiKey });
  const scrapeOptions = {
    formats: ["html"],
    onlyMainContent: false,
    proxy: "auto",
    integration: "prometheus",
  } as Parameters<typeof firecrawl.scrape>[1];
  const scraped = (await firecrawl.scrape(sourceUrl, scrapeOptions)) as { html?: string };
  const html = scraped?.html;
  if (typeof html !== "string" || html.length === 0) {
    throw new Error(`no HTML returned for ${sourceUrl} (page may be bot-blocked or empty)`);
  }

  const $ = cheerio.load(html);
  const results = $("div.result");

  // Serialize the final payload; key order follows the documented output shape.
  const emit = (listings: Business[]): void => {
    process.stdout.write(
      JSON.stringify({
        query,
        searchTerms: terms,
        location,
        sourceUrl,
        count: listings.length,
        businesses: listings,
      }),
    );
  };

  if (results.length === 0) {
    // In-scope page that simply yielded no listings (e.g. obscure query/location).
    emit([]);
    return;
  }

  const businesses: Business[] = [];
  for (const el of results.toArray()) {
    const node = $(el);
    // Cleaned text of the first descendant matching `selector`, or null.
    const firstText = (selector: string): string | null =>
      cleanText(node.find(selector).first().text());

    const name = firstText("a.business-name");
    if (!name) continue; // skip non-listing rows

    const addressParts = [firstText(".street-address"), firstText(".locality")].filter(Boolean);
    const address = addressParts.length > 0 ? addressParts.join(", ") : null;
    const phone = firstText(".phones.phone.primary");
    const href = node.find("a.track-visit-website").first().attr("href");
    const website = href ? href : null;
    const category = firstText(".categories a");
    const rating = parseRating(node.find(".result-rating").first().attr("class"));
    const reviewCount = parseReviewCount(firstText(".ratings .count"));

    businesses.push({ name, address, phone, website, category, rating, reviewCount });
  }

  if (businesses.length === 0) {
    throw new Error(`found ${results.length} result blocks but none had a business name (page structure may have changed)`);
  }

  emit(businesses);
}
// Run the collector; on any failure print the reason to stderr and exit with
// a non-zero status.
main().catch((failure: unknown) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.