Alternative Real Estate Listing Tracker
v1 · Published — Public listing rows from selected real-estate and short-term-rental platforms with visible pricing, property facts, history signals, and rental-yield proxy fields when derivable.
Output & API
Preview the latest data, download it, or call this collector as an API.
| city | |
|---|---|
| query | 2 bedroom condos Miami |
| region | global |
| bedrooms | |
| listings | |
| platforms | |
| output_mode | listing_rows |
| price_range | |
| collected_at | 2026-07-03T05:20:41.963Z |
| max_listings | 3 |
| listing_count | 3 |
| property_type | |
| snapshot_label | |
Marketplace
Publish this collector so others can deploy it — you keep ownership.
Versions
Every build and self-heal appends a version. Pin one to lock runs to it.
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";
/** Listing platforms this collector can target; these are also the only values `--platforms` accepts. */
type Platform = "zillow" | "redfin" | "airbnb";
/** Shape of the emitted JSON: a flat `listings` array, or rows bucketed under `grouped_by_platform`. */
type OutputMode = "listing_rows" | "grouped_by_platform";
/** A listing URL found during discovery, before its detail page has been scraped. */
type Candidate = {
platform: Platform;
// Canonicalized listing URL; together with `platform` it forms the de-duplication key.
url: string;
// Seed URL, search-result URL, or crawled page on which the listing was first seen.
sourcePageUrl: string;
// Additional pages that surfaced the same listing after it was first recorded.
secondarySourceUrls: string[];
// Search-result title/description; null when no search result supplied one.
title: string | null;
description: string | null;
};
/**
 * One output row per scraped listing. Fields are null when the value could not be
 * extracted from the page (or, for fallback rows, from the search-result text).
 */
type ListingRow = {
// `${platform}:${listing_id}` when an id was found, otherwise `${platform}:${listing_url}`.
tracking_key: string;
// Value of --snapshot-label, shared by every row of a run.
snapshot_label: string;
// ISO timestamp taken once at the start of the run.
collected_at: string;
platform: Platform;
listing_id: string | null;
listing_url: string;
source_page_url: string;
secondary_source_urls: string[];
property_title: string | null;
address_text: string | null;
city: string | null;
region: string | null;
country: string | null;
property_type: string | null;
listing_status: string | null;
// Price as it appeared on the page, plus its numeric form.
asking_price_text: string | null;
asking_price_normalized: number | null;
currency: string | null;
bedrooms: number | null;
bathrooms: number | null;
square_feet: number | null;
lot_size_text: string | null;
days_on_market_text: string | null;
days_on_market_normalized: number | null;
price_cut_text: string | null;
price_cut_amount: number | null;
estimated_rent_text: string | null;
// The next five fields are only populated for Airbnb rows.
nightly_rate_text: string | null;
fees_text: string | null;
review_count: number | null;
rating_value: number | null;
minimum_stay_text: string | null;
// Null when --include-history is "false".
listing_history: string[] | null;
// Annualized rent divided by asking price; null when disabled or not derivable.
estimated_gross_yield: number | null;
// Never computed by the visible code; always emitted as null.
estimated_net_yield: number | null;
confidence: "high" | "medium" | "low";
notes: string | null;
};
// The Firecrawl client needs an API key; stop before doing any other work when it is absent.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (apiKey === undefined || apiKey.length === 0) {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}
// Parse the collector's command-line flags. Every option is declared as a string here;
// list, numeric, and boolean values are converted and validated after parsing.
// `strict: true` makes parseArgs reject flags that are not declared below.
const { values: flags } = parseArgs({
strict: true,
options: {
platforms: { type: "string" },
query: { type: "string" },
"seed-urls": { type: "string" },
"max-listings": { type: "string" },
"output-mode": { type: "string" },
region: { type: "string" },
city: { type: "string" },
"property-type": { type: "string" },
bedrooms: { type: "string" },
"price-range": { type: "string" },
"include-history": { type: "string" },
"include-rental-yield": { type: "string" },
"snapshot-label": { type: "string" },
"sort-hint": { type: "string" },
},
});
/**
 * Reads a flag that must be present on the command line and returns it trimmed.
 * A missing flag prints an error and exits with status 1; an empty value is allowed.
 *
 * @param name Flag name without the leading dashes.
 */
function requiredString(name: string): string {
  const raw = flags[name] as string | undefined;
  if (raw !== undefined) return raw.trim();
  console.error(`--${name} is required`);
  process.exit(1);
}
// ---- Resolve parsed flags into module-level settings -------------------------------------

// Required flags: each must be passed, although its value may be an empty string.
const platformsInput = requiredString("platforms");
const query = requiredString("query");
const seedUrlsInput = requiredString("seed-urls");
const maxListingsInput = requiredString("max-listings");
const outputModeInput = requiredString("output-mode");

// Optional filters. An omitted or blank region / sort hint falls back to its default.
const regionFilter = ((flags.region as string | undefined) ?? "global").trim() || "global";
const cityFilter = ((flags.city as string | undefined) ?? "").trim();
const propertyTypeFilter = ((flags["property-type"] as string | undefined) ?? "").trim();
const bedroomsFilter = ((flags.bedrooms as string | undefined) ?? "").trim();
const priceRangeFilter = ((flags["price-range"] as string | undefined) ?? "").trim();
const includeHistory = parseBooleanFlag((flags["include-history"] as string | undefined) ?? "true", "include-history");
const includeRentalYield = parseBooleanFlag((flags["include-rental-yield"] as string | undefined) ?? "true", "include-rental-yield");
// A blank --snapshot-label is treated like an omitted one and replaced by the current
// timestamp, so rows are never emitted with an empty label.
const snapshotLabel = ((flags["snapshot-label"] as string | undefined) ?? "").trim() || new Date().toISOString();
const sortHint = ((flags["sort-hint"] as string | undefined) ?? "best match").trim() || "best match";

// Convert the raw required values into their typed forms.
const platforms = parsePlatforms(platformsInput);
const seedUrls = seedUrlsInput.split(",").map((url) => url.trim()).filter(Boolean);
const maxListings = Number(maxListingsInput);
const outputMode = parseOutputMode(outputModeInput);

// Discovery needs something to start from: a search query, seed URLs, or both.
if (!query && seedUrls.length === 0) {
  throw new Error("OUT_OF_SCOPE: at least one of --query or --seed-urls must be provided");
}
if (!Number.isInteger(maxListings) || maxListings < 1 || maxListings > 100) {
  throw new Error("OUT_OF_SCOPE: --max-listings must be an integer from 1 to 100");
}

const firecrawl = new Firecrawl({ apiKey });
/**
 * Converts the literal strings "true" / "false" into a boolean.
 *
 * @param value Raw flag value; matched exactly (case-sensitive, no trimming).
 * @param name Flag name used in the error message.
 * @throws Error prefixed with OUT_OF_SCOPE for any other value.
 */
function parseBooleanFlag(value: string, name: string): boolean {
  switch (value) {
    case "true":
      return true;
    case "false":
      return false;
    default:
      throw new Error(`OUT_OF_SCOPE: --${name} must be "true" or "false"`);
  }
}
/**
 * Parses the comma-separated --platforms value into a de-duplicated list,
 * preserving the order in which each platform first appears.
 *
 * @param value Comma-separated platform names; case and surrounding whitespace are ignored.
 * @throws Error prefixed with OUT_OF_SCOPE when the list is empty or names an unknown platform.
 */
function parsePlatforms(value: string): Platform[] {
  const requested = value
    .split(",")
    .map((token) => token.trim().toLowerCase())
    .filter((token) => token.length > 0);
  if (requested.length === 0) {
    throw new Error("OUT_OF_SCOPE: --platforms must include at least one platform");
  }

  // A Set keeps first-insertion order, which gives de-duplication for free.
  const selected = new Set<Platform>();
  for (const token of requested) {
    if (token === "zillow" || token === "redfin" || token === "airbnb") {
      selected.add(token);
    } else {
      throw new Error("OUT_OF_SCOPE: --platforms values must be zillow, redfin, or airbnb");
    }
  }
  return Array.from(selected);
}
/**
 * Validates the --output-mode value.
 *
 * @throws Error prefixed with OUT_OF_SCOPE when the value is not one of the two supported modes.
 */
function parseOutputMode(value: string): OutputMode {
  if (value === "listing_rows" || value === "grouped_by_platform") {
    return value;
  }
  throw new Error('OUT_OF_SCOPE: --output-mode must be "listing_rows" or "grouped_by_platform"');
}
/**
 * Identifies which supported platform hosts a URL, judged by its hostname.
 *
 * Only the hostname is inspected, so a platform name that merely appears in the path or
 * query string of another site does not count. Airbnb is recognised on any regional
 * domain (airbnb.co.uk, airbnb.com.au, ...), matching what `isListingUrl` and
 * `canonicalUrl` accept. URLs without a scheme are parsed as if they started with https://.
 *
 * @param url Absolute or scheme-less URL.
 * @returns The platform, or null when the URL is unparseable or hosted elsewhere.
 */
function platformForUrl(url: string): Platform | null {
  const trimmed = url.trim();
  if (!trimmed) return null;

  let host: string;
  try {
    const withScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
    host = new URL(withScheme).hostname.toLowerCase();
  } catch {
    // Not parseable as a URL, so it cannot belong to a known platform.
    return null;
  }

  const isOnDomain = (domain: string): boolean => host === domain || host.endsWith(`.${domain}`);
  if (isOnDomain("zillow.com")) return "zillow";
  if (isOnDomain("redfin.com")) return "redfin";
  if (/(?:^|\.)airbnb\.[a-z.]+$/.test(host)) return "airbnb";
  return null;
}
/**
 * Tells whether a URL points at a single listing's detail page (as opposed to a
 * search or index page) on the given platform. Matching is case-insensitive.
 */
function isListingUrl(platform: Platform, url: string): boolean {
  const target = url.toLowerCase();
  switch (platform) {
    case "zillow":
      return /zillow\.com\/homedetails\/.+_zpid\/?/.test(target);
    case "redfin":
      return /redfin\.com\/.+\/home\/\d+/.test(target);
    default:
      // Airbnb: any regional domain followed by /rooms/<numeric id>.
      return /airbnb\.[a-z.]+\/rooms\/\d+/.test(target);
  }
}
/**
 * Reduces a listing URL to a stable form used for de-duplication.
 *
 * The query string and fragment are always dropped. When the remainder matches the
 * platform's listing pattern it is trimmed to that pattern (Zillow keeps a trailing
 * slash, Airbnb regional domains are rewritten to airbnb.com); otherwise the stripped
 * URL is returned as is.
 */
function canonicalUrl(platform: Platform, url: string): string {
  const cutAt = url.search(/[?#]/);
  const bare = cutAt === -1 ? url : url.slice(0, cutAt);

  switch (platform) {
    case "zillow": {
      const hit = /^(https?:\/\/(?:www\.)?zillow\.com\/homedetails\/[^ ]+?_zpid)\/?/i.exec(bare);
      return hit === null ? bare : `${hit[1]}/`;
    }
    case "redfin": {
      const hit = /^(https?:\/\/(?:www\.)?redfin\.com\/.+?\/home\/\d+)/i.exec(bare);
      return hit === null ? bare : hit[1];
    }
    default: {
      const hit = /^(https?:\/\/(?:www\.)?airbnb\.[a-z.]+\/rooms\/\d+)/i.exec(bare);
      if (hit === null) return bare;
      return hit[1].replace(/airbnb\.[a-z.]+/i, "airbnb.com");
    }
  }
}
/**
 * Extracts the platform's numeric listing id from the URL.
 * For Redfin, falls back to the `al:ios:url` metadata value with its
 * `redfin://home/` prefix removed when the URL carries no id.
 */
function listingId(platform: Platform, url: string, metadata: Record<string, unknown>): string | null {
  switch (platform) {
    case "zillow":
      return firstMatch(url, /\/(\d+)_zpid\/?/);
    case "redfin": {
      const fromUrl = firstMatch(url, /\/home\/(\d+)/);
      if (fromUrl !== null) return fromUrl;
      const deepLink = stringMeta(metadata, "al:ios:url");
      return deepLink === null ? null : deepLink.replace(/^redfin:\/\/home\//, "");
    }
    default:
      return firstMatch(url, /\/rooms\/(\d+)/);
  }
}
/**
 * Builds the web-search query for one platform: a `site:` restriction, the user's
 * query and every non-empty filter, then a platform-specific trailing hint.
 */
function searchQueryFor(platform: Platform): string {
  const terms: string[] = [query, cityFilter];
  // "global" means no regional restriction, so it is left out of the query.
  if (regionFilter !== "global") terms.push(regionFilter);
  terms.push(propertyTypeFilter);
  if (bedroomsFilter) terms.push(`${bedroomsFilter} bedrooms`);
  terms.push(priceRangeFilter, sortHint);
  const bits = terms.filter((term) => term.length > 0).join(" ");

  switch (platform) {
    case "zillow":
      return `site:zillow.com/homedetails ${bits} Zillow home details`;
    case "redfin":
      return `site:redfin.com ${bits} Redfin home details`;
    default:
      return `site:airbnb.com/rooms ${bits} Airbnb rooms`;
  }
}
/**
 * Collects candidate listing URLs from seed URLs and from one web search per platform.
 *
 * Seeds and search hits that are already listing pages are recorded directly; other
 * pages are scraped for listing links. The result interleaves the platforms
 * round-robin (first candidate of each platform, then the second of each, ...), so a
 * later cut-off at --max-listings does not favour whichever platform came first.
 */
async function discoverCandidates(): Promise<Candidate[]> {
  const found = new Map<string, Candidate>();

  // Pass 1: user-supplied seed URLs on the selected platforms.
  for (const seed of seedUrls) {
    const seedPlatform = platformForUrl(seed);
    if (seedPlatform === null || !platforms.includes(seedPlatform)) continue;
    const listingUrl = canonicalUrl(seedPlatform, seed);
    if (!isListingUrl(seedPlatform, listingUrl)) {
      await addLinksFromPage(found, seedPlatform, seed);
      continue;
    }
    addCandidate(found, {
      platform: seedPlatform,
      url: listingUrl,
      sourcePageUrl: seed,
      secondarySourceUrls: [],
      title: null,
      description: null,
    });
  }

  // Pass 2: one site-restricted search per platform.
  if (query) {
    const searchLimit = Math.min(10, Math.max(3, maxListings * 2));
    // Stop expanding non-listing result pages once plenty of candidates are known.
    const expansionCap = maxListings * platforms.length * 3;
    for (const platform of platforms) {
      const response = await firecrawl.search(searchQueryFor(platform), {
        limit: searchLimit,
        integration: "prometheus",
      });
      const hits = Array.isArray(response.web) ? response.web : [];
      for (const hit of hits) {
        const hitUrl = typeof hit.url === "string" ? hit.url : "";
        if (hitUrl === "" || platformForUrl(hitUrl) !== platform) continue;
        const listingUrl = canonicalUrl(platform, hitUrl);
        if (isListingUrl(platform, listingUrl)) {
          addCandidate(found, {
            platform,
            url: listingUrl,
            sourcePageUrl: hitUrl,
            secondarySourceUrls: [],
            title: typeof hit.title === "string" ? hit.title : null,
            description: typeof hit.description === "string" ? hit.description : null,
          });
        } else if (found.size < expansionCap) {
          await addLinksFromPage(found, platform, hitUrl);
        }
      }
    }
  }

  // Bucket by platform (discovery order is kept inside each bucket), then interleave.
  const perPlatform = new Map<Platform, Candidate[]>(
    platforms.map((platform): [Platform, Candidate[]] => [platform, []]),
  );
  for (const candidate of found.values()) {
    perPlatform.get(candidate.platform)?.push(candidate);
  }
  const depth = Math.max(0, ...Array.from(perPlatform.values(), (bucket) => bucket.length));
  const interleaved: Candidate[] = [];
  for (let rank = 0; rank < depth; rank += 1) {
    for (const platform of platforms) {
      const candidate = perPlatform.get(platform)?.[rank];
      if (candidate) interleaved.push(candidate);
    }
  }
  return interleaved;
}
/**
 * Scrapes a non-listing page (search results, index, ...) and records every listing
 * link on it that belongs to `platform`.
 *
 * Links come from Firecrawl's link list and from the anchors in the returned HTML.
 * Any failure is logged and swallowed so one bad page does not abort discovery.
 */
async function addLinksFromPage(byKey: Map<string, Candidate>, platform: Platform, pageUrl: string): Promise<void> {
  try {
    const page = await firecrawl.scrape(pageUrl, {
      formats: ["html", "links"],
      integration: "prometheus",
      proxy: platform === "zillow" ? "stealth" : "auto",
    });

    const discovered = new Set<string>(Array.isArray(page.links) ? page.links : []);

    const $ = cheerio.load(typeof page.html === "string" ? page.html : "");
    for (const anchor of $("a[href]").toArray()) {
      const href = $(anchor).attr("href");
      if (!href) continue;
      try {
        // Resolve relative hrefs against the page they were found on.
        discovered.add(new URL(href, pageUrl).toString());
      } catch {
        // Ignore malformed hrefs from the source page.
      }
    }

    for (const link of discovered) {
      if (platformForUrl(link) !== platform) continue;
      const listingUrl = canonicalUrl(platform, link);
      if (!isListingUrl(platform, listingUrl)) continue;
      addCandidate(byKey, {
        platform,
        url: listingUrl,
        sourcePageUrl: pageUrl,
        secondarySourceUrls: [],
        title: null,
        description: null,
      });
    }
  } catch (failure) {
    console.error(`Could not discover links from ${pageUrl}: ${failure}`);
  }
}
/**
 * Records a candidate, merging it into an existing entry for the same platform + URL.
 *
 * On a merge the first-seen entry wins: the newcomer only contributes its source page
 * (as a secondary source, if not already known) and fills in a missing title/description.
 */
function addCandidate(byKey: Map<string, Candidate>, candidate: Candidate): void {
  const dedupeKey = [candidate.platform, candidate.url].join(":");
  const known = byKey.get(dedupeKey);
  if (known === undefined) {
    byKey.set(dedupeKey, candidate);
    return;
  }

  const source = candidate.sourcePageUrl;
  const alreadyRecorded = source === known.sourcePageUrl || known.secondarySourceUrls.includes(source);
  if (!alreadyRecorded) {
    known.secondarySourceUrls.push(source);
  }

  if (known.title == null) known.title = candidate.title;
  if (known.description == null) known.description = candidate.description;
}
/**
 * Scrapes one listing's detail page and converts it into a row.
 * If scraping or row building throws, the error is logged and a low-confidence row
 * built from the discovery title/description is returned instead.
 */
async function scrapeCandidate(candidate: Candidate, collectedAt: string): Promise<ListingRow> {
  const proxy = candidate.platform === "zillow" ? "stealth" : "auto";
  try {
    const page = await firecrawl.scrape(candidate.url, {
      formats: ["markdown", "html"],
      integration: "prometheus",
      proxy,
    });
    return rowFromDocument(candidate, page, collectedAt);
  } catch (failure) {
    console.error(`Could not scrape ${candidate.url}: ${failure}`);
    const note = `Detail scrape failed; fields are limited to discovery result text.`;
    return rowFromFallback(candidate, collectedAt, note);
  }
}
/**
 * Builds a ListingRow from a scraped detail page.
 *
 * @param candidate Discovery record for the listing (URL, platform, search title/description).
 * @param doc Scrape result; only `metadata`, `markdown` and `html` are read.
 * @param collectedAt Run timestamp copied into the row.
 * @returns A fully populated row; fields that cannot be extracted are null.
 */
function rowFromDocument(candidate: Candidate, doc: any, collectedAt: string): ListingRow {
const metadata = (doc.metadata ?? {}) as Record<string, unknown>;
const markdown = typeof doc.markdown === "string" ? doc.markdown : "";
const html = typeof doc.html === "string" ? doc.html : "";
// Prefer the markdown rendering; fall back to text stripped from the HTML.
const pageText = markdown || htmlToText(html);
// rawText keeps its line breaks (history extraction splits on them); `text` is the
// whitespace-collapsed form that the single-line regex extractors run against.
const rawText = `${candidate.title ?? ""}\n${candidate.description ?? ""}\n${metadataText(metadata)}\n${pageText}`;
const text = compactText(rawText);
const title = extractTitle(candidate, metadata);
const address = extractAddress(candidate.platform, metadata, title, text);
const priceText = extractAskingPrice(candidate.platform, metadata, text);
const rentText = extractEstimatedRent(text);
// Nightly rates are only looked for on Airbnb pages.
const nightlyText = candidate.platform === "airbnb" ? extractNightlyRate(text) : null;
const price = normalizeMoney(priceText);
const monthlyRent = normalizeMoney(rentText);
const nightlyRate = normalizeMoney(nightlyText);
const history = includeHistory ? extractHistory(rawText) : null;
const grossYield = includeRentalYield ? computeGrossYield(price, monthlyRent, nightlyRate) : null;
const id = listingId(candidate.platform, candidate.url, metadata);
// City/region fall back to the CLI filters when nothing can be read off the page.
const derivedCity = extractCity(candidate.platform, metadata, address, text) ?? (cityFilter || null);
const derivedRegion = extractRegion(candidate.platform, metadata, address, text) ?? (regionFilter !== "global" ? regionFilter : null);
const country = extractCountry(candidate.platform, metadata, text);
// Property order below is the key order of the emitted JSON.
const row: ListingRow = {
tracking_key: `${candidate.platform}:${id ?? candidate.url}`,
snapshot_label: snapshotLabel,
collected_at: collectedAt,
platform: candidate.platform,
listing_id: id,
listing_url: candidate.url,
source_page_url: candidate.sourcePageUrl,
secondary_source_urls: candidate.secondarySourceUrls,
property_title: title,
address_text: address,
city: derivedCity,
region: derivedRegion,
country,
property_type: extractPropertyType(candidate.platform, metadata, text),
listing_status: extractStatus(candidate.platform, text),
asking_price_text: priceText,
asking_price_normalized: price,
currency: currencyFromText(priceText ?? nightlyText ?? rentText),
// Structured metadata wins over a regex scan of the page text for beds/baths/sqft.
bedrooms: extractNumber(metadata, ["twitter:text:beds", "zillow_fb:beds"]) ?? numberNear(text, /\b(\d+(?:\.\d+)?)\s*(?:bed|bedroom|bedrooms|beds)\b/i),
bathrooms: extractNumber(metadata, ["twitter:text:baths", "zillow_fb:baths"]) ?? numberNear(text, /\b(\d+(?:\.\d+)?)\s*(?:bath|bathroom|bathrooms|baths)\b/i),
square_feet: extractNumber(metadata, ["twitter:text:sqft"]) ?? numberNear(text, /\b([\d,]+)\s*(?:sq\.?\s*ft|square feet|sqft)\b/i),
lot_size_text: firstMatch(text, /([\d,.]+\s*(?:acre|acres|sq\.?\s*ft\.?)\s+lot)\b/i),
days_on_market_text: extractDaysOnMarketText(text),
days_on_market_normalized: normalizeDays(extractDaysOnMarketText(text)),
price_cut_text: extractPriceCutText(text),
price_cut_amount: normalizeMoney(extractPriceCutText(text)),
estimated_rent_text: rentText,
nightly_rate_text: nightlyText,
fees_text: candidate.platform === "airbnb" ? extractFees(text) : null,
review_count: candidate.platform === "airbnb" ? extractReviewCount(text) : null,
rating_value: candidate.platform === "airbnb" ? extractRating(text) : null,
minimum_stay_text: candidate.platform === "airbnb" ? extractMinimumStay(text) : null,
listing_history: history,
estimated_gross_yield: grossYield,
estimated_net_yield: null,
// NOTE(review): confidence is scored on the markdown length only, so a page that
// yielded HTML but no markdown can never rate above "low" — confirm this is intended.
confidence: confidenceFor(priceText, address, markdown),
notes: buildNotes(candidate.platform, includeHistory, includeRentalYield, grossYield),
};
return row;
}
/**
 * Builds a reduced ListingRow when no detail page is available, using only the
 * candidate's discovery title and description plus the CLI filters.
 *
 * @param candidate Discovery record for the listing.
 * @param collectedAt Run timestamp copied into the row.
 * @param note Explanation stored in the row's `notes` field.
 * @returns A row whose confidence is always "low" and whose yields are always null.
 */
function rowFromFallback(candidate: Candidate, collectedAt: string, note: string): ListingRow {
const text = compactText(`${candidate.title ?? ""}\n${candidate.description ?? ""}`);
// No page metadata exists here, so the extractors receive an empty metadata object.
const priceText = extractAskingPrice(candidate.platform, {}, text);
const id = listingId(candidate.platform, candidate.url, {});
// Property order below is the key order of the emitted JSON.
return {
tracking_key: `${candidate.platform}:${id ?? candidate.url}`,
snapshot_label: snapshotLabel,
collected_at: collectedAt,
platform: candidate.platform,
listing_id: id,
listing_url: candidate.url,
source_page_url: candidate.sourcePageUrl,
secondary_source_urls: candidate.secondarySourceUrls,
property_title: candidate.title,
address_text: null,
// Location and property type are taken straight from the CLI filters.
city: cityFilter || null,
region: regionFilter !== "global" ? regionFilter : null,
country: null,
property_type: propertyTypeFilter || null,
listing_status: extractStatus(candidate.platform, text),
asking_price_text: priceText,
asking_price_normalized: normalizeMoney(priceText),
currency: currencyFromText(priceText),
bedrooms: numberNear(text, /\b(\d+(?:\.\d+)?)\s*(?:bed|bedroom|bedrooms|beds)\b/i),
bathrooms: numberNear(text, /\b(\d+(?:\.\d+)?)\s*(?:bath|bathroom|bathrooms|baths)\b/i),
square_feet: numberNear(text, /\b([\d,]+)\s*(?:sq\.?\s*ft|square feet|sqft)\b/i),
lot_size_text: null,
days_on_market_text: extractDaysOnMarketText(text),
days_on_market_normalized: normalizeDays(extractDaysOnMarketText(text)),
price_cut_text: extractPriceCutText(text),
price_cut_amount: normalizeMoney(extractPriceCutText(text)),
estimated_rent_text: extractEstimatedRent(text),
nightly_rate_text: candidate.platform === "airbnb" ? extractNightlyRate(text) : null,
fees_text: null,
review_count: candidate.platform === "airbnb" ? extractReviewCount(text) : null,
rating_value: candidate.platform === "airbnb" ? extractRating(text) : null,
minimum_stay_text: candidate.platform === "airbnb" ? extractMinimumStay(text) : null,
listing_history: includeHistory ? extractHistory(text) : null,
estimated_gross_yield: null,
estimated_net_yield: null,
confidence: "low",
notes: note,
};
}
/**
 * Picks the listing title from page metadata (og:title, ogTitle, twitter:title, title,
 * in that order), falling back to the discovery title, and strips the trailing
 * "| Zillow…" / "| Redfin…" / "- Airbnb…" site suffix.
 */
function extractTitle(candidate: Candidate, metadata: Record<string, unknown>): string | null {
  let raw: string | null = null;
  for (const key of ["og:title", "ogTitle", "twitter:title", "title"]) {
    raw = stringMeta(metadata, key);
    if (raw !== null) break;
  }
  raw = raw ?? candidate.title;
  if (!raw) return null;

  const withoutSiteSuffix = raw
    .replace(/\s+\|\s+(Zillow|Redfin).*$/i, "")
    .replace(/\s+-\s+Airbnb.*$/i, "");
  return cleanValue(withoutSiteSuffix);
}
/**
 * Extracts the listing's postal address as "street, city, ST 12345" where possible.
 *
 * - Zillow: the `og:zillow_fb:address` metadata value, else the address at the start
 *   of the title. The whole "street, city, ST zip" span is returned (not just the
 *   street) because city and region are later parsed out of this string.
 * - Redfin: assembled from the twitter:text:* metadata fields when street, city and
 *   state are all present; otherwise the generic pattern below is tried.
 * - Otherwise: the first US-style address found in the title, or in the page text
 *   when there is no title.
 *
 * @returns The address, or null when none is found.
 */
function extractAddress(platform: Platform, metadata: Record<string, unknown>, title: string | null, text: string): string | null {
  if (platform === "zillow") {
    return stringMeta(metadata, "og:zillow_fb:address")
      ?? firstMatch(title ?? "", /^(.+?,\s*[^,]+,\s*[A-Z]{2}\s*\d{5})/);
  }
  if (platform === "redfin") {
    const street = stringMeta(metadata, "twitter:text:street_address");
    const city = stringMeta(metadata, "twitter:text:city");
    const state = stringMeta(metadata, "twitter:text:state_code");
    const zip = stringMeta(metadata, "twitter:text:zip");
    if (street && city && state) return cleanValue(`${street}, ${city}, ${state}${zip ? ` ${zip}` : ""}`);
  }
  return firstMatch(title ?? text, /(\d{1,6}\s+[^|\n]+?,\s*[^,\n]+,\s*[A-Z]{2}\s*\d{5})/i);
}
/**
 * Finds the asking price text for a for-sale listing.
 * Redfin prefers the `twitter:text:price` metadata value; both Redfin and Zillow then
 * try a short list of page-text patterns in order. Airbnb has no asking price, so null.
 */
function extractAskingPrice(platform: Platform, metadata: Record<string, unknown>, text: string): string | null {
  let patterns: RegExp[];
  if (platform === "redfin") {
    const tagged = stringMeta(metadata, "twitter:text:price");
    if (tagged !== null) return tagged;
    patterns = [
      /For Sale:\s*[^$]{0,80}(\$[\d,.]+[KMB]?)/i,
      /(\$[\d,.]+[KMB]?)\s*(?:\d+\s*beds?|\d+\s*baths?)/i,
    ];
  } else if (platform === "zillow") {
    patterns = [
      /Zillow has \d+ photos of this\s+(\$[\d,.]+[KMB]?)/i,
      /For sale\.?\s*(?:Price cut:[^.]+?\.)?\s*(\$[\d,.]+[KMB]?)/i,
      /(\$[\d,.]+[KMB]?)\s+\d+\s*beds?/i,
    ];
  } else {
    return null;
  }

  for (const pattern of patterns) {
    const hit = firstMatch(text, pattern);
    if (hit !== null) return hit;
  }
  return null;
}
/** Finds a monthly rent estimate following "Rent Zestimate" or "estimated rent". */
function extractEstimatedRent(text: string): string | null {
  const zestimate = firstMatch(text, /Rent Zestimate[^$]{0,80}(\$[\d,.]+[KMB]?\s*(?:\/mo|per month|monthly)?)/i);
  if (zestimate !== null) return zestimate;
  return firstMatch(text, /estimated rent[^$]{0,80}(\$[\d,.]+[KMB]?\s*(?:\/mo|per month|monthly)?)/i);
}
/** Finds a per-night price written with a "$" sign or, failing that, a "USD" prefix. */
function extractNightlyRate(text: string): string | null {
  const dollarRate = firstMatch(text, /(\$[\d,.]+[KMB]?\s*(?:per night|\/night|night))/i);
  if (dollarRate !== null) return dollarRate;
  return firstMatch(text, /(USD\s*[\d,.]+[KMB]?\s*(?:per night|\/night|night))/i);
}
/** Returns the first fee mention (cleaning, service, guest, extra guest, resort) with up to 120 trailing characters. */
function extractFees(text: string): string | null {
  const feeMention = /((?:cleaning|service|guest|extra guest|resort)\s+fee[^.\n]{0,120})/i;
  return firstMatch(text, feeMention);
}
/** Parses the number in front of the first "review"/"reviews" mention. */
function extractReviewCount(text: string): number | null {
  const reviewTally = /\b([\d,]+)\s+reviews?\b/i;
  return numberNear(text, reviewTally);
}
/** Parses a rating such as "4.9 out of 5", "4.9 ★" or "4.9 stars". */
function extractRating(text: string): number | null {
  const ratingMention = /(?:Rated\s*)?(\d(?:\.\d{1,2})?)\s*(?:out of 5|★|stars?)/i;
  return numberNear(text, ratingMention);
}
/** Returns the first minimum-stay phrase found, trying three phrasings in order. */
function extractMinimumStay(text: string): string | null {
  const phrasings = [
    /(minimum\s+\d+\s+night\s+stay)/i,
    /(minimum stay[^.\n]{0,80})/i,
    /(stays? of minimum\s+\d+\s+(?:night|nights|week|weeks|month|months))/i,
  ];
  for (const phrasing of phrasings) {
    const hit = firstMatch(text, phrasing);
    if (hit !== null) return hit;
  }
  return null;
}
/**
 * Determines the property type.
 * Airbnb: the words before " in " at the start of the og:title/description headline.
 * Others: a known type keyword in the headline or next to a "Property Type" label in
 * the page text. Falls back to the --property-type filter, or null.
 */
function extractPropertyType(platform: Platform, metadata: Record<string, unknown>, text: string): string | null {
  const headline = `${stringMeta(metadata, "og:title") ?? ""} ${stringMeta(metadata, "description") ?? ""}`;

  if (platform === "airbnb") {
    const leadIn = firstMatch(headline, /^([A-Za-z ]+?)\s+in\s+/);
    return leadIn !== null ? leadIn : propertyTypeFilterOrNull();
  }

  // [haystack, pattern, capture group] — tried in order, first hit wins.
  const attempts: Array<[string, RegExp, number]> = [
    [headline, /\b(Condo|House|Townhouse|Apartment|Multi-family|Single Family|Villa)\s+home\b/i, 1],
    [text, /\b(Condo|House|Townhouse|Apartment|Multi-family|Single Family|Villa)\s*Property Type\b/i, 1],
    [text, /\b(Condo|House|Townhouse|Apartment|Multi-family|Single Family|Villa)\b.{0,30}\bProperty Type\b/i, 1],
    [text, /\b(Property Type|Type)\s*[:\-]?\s*(Condo|House|Townhouse|Apartment|Multi-family|Single Family|Villa)\b/i, 2],
  ];
  for (const [haystack, pattern, group] of attempts) {
    const hit = firstMatch(haystack, pattern, group);
    if (hit !== null) return hit;
  }
  return propertyTypeFilterOrNull();
}
/** Returns the --property-type filter, or null when the filter is empty. */
function propertyTypeFilterOrNull(): string | null {
  if (propertyTypeFilter) return propertyTypeFilter;
  return null;
}
/**
 * Returns the listing status. Airbnb rows always get a fixed label; otherwise the
 * first status keyword in the text, or a "NEW n HRS/DAYS AGO" badge, is returned.
 */
function extractStatus(platform: Platform, text: string): string | null {
  if (platform === "airbnb") {
    return "active short-term rental listing";
  }
  const keyword = firstMatch(text, /\b(For Sale|For Rent|Pending|Contingent|Sold|Off Market|Coming Soon)\b/i);
  if (keyword !== null) return keyword;
  return firstMatch(text, /\b(NEW\s+\d+\s+(?:HRS?|DAYS?)\s+AGO)\b/i);
}
/**
 * Extracts the city: from metadata for Redfin, from an "... in <city>," phrase for
 * Airbnb, and otherwise from the second-to-last comma-separated part of the address.
 */
function extractCity(platform: Platform, metadata: Record<string, unknown>, address: string | null, text: string): string | null {
  switch (platform) {
    case "redfin":
      return stringMeta(metadata, "twitter:text:city");
    case "airbnb":
      return firstMatch(text, /(?:for Rent in|Rental unit in|Home in|Villa in)\s+([^,\n]+),/i);
    default: {
      if (!address) return null;
      const segments = address.split(",").map((segment) => segment.trim());
      if (segments.length < 2) return null;
      return segments[segments.length - 2] ?? null;
    }
  }
}
/**
 * Extracts the region/state. A `geo.region` metadata value wins for every platform;
 * otherwise Redfin uses its state-code metadata, Airbnb a comma-delimited phrase in
 * the text, and other platforms the two-letter state before the ZIP in the address.
 */
function extractRegion(platform: Platform, metadata: Record<string, unknown>, address: string | null, text: string): string | null {
  const geoRegion = stringMeta(metadata, "geo.region");
  if (geoRegion) return geoRegion;

  switch (platform) {
    case "redfin":
      return stringMeta(metadata, "twitter:text:state_code");
    case "airbnb":
      return firstMatch(text, /,\s*([^,\n]+),\s*[^,\n]+(?:\s+-\s+Airbnb)?/i);
    default:
      return address ? firstMatch(address, /,\s*([A-Z]{2})\s+\d{5}/) : null;
  }
}
/**
 * Extracts the country. A `geo.region` starting with "US-" and every Zillow/Redfin
 * listing yield "US"; Airbnb uses the last comma-separated part before "Airbnb".
 */
function extractCountry(platform: Platform, metadata: Record<string, unknown>, text: string): string | null {
  const geoRegion = stringMeta(metadata, "geo.region");
  if (geoRegion?.startsWith("US-")) return "US";

  switch (platform) {
    case "zillow":
    case "redfin":
      return "US";
    case "airbnb": {
      const dashed = firstMatch(text, /,\s*([^,\n]+)\s+-\s+Airbnb/i);
      return dashed !== null ? dashed : firstMatch(text, /,\s*([^,\n]+)\s+Airbnb/i);
    }
    default:
      return null;
  }
}
/**
 * Finds an "N days on Zillow/Redfin/market" phrase, or a "NEW N HRS/DAYS AGO" badge.
 * Asterisks (markdown emphasis) are blanked out first so they do not split the phrase.
 */
function extractDaysOnMarketText(text: string): string | null {
  const unstarred = text.split("*").join(" ");
  const dayCount = firstMatch(unstarred, /(?:^|[^\d])(\d{1,4}\s+days?\s+on\s+(?:Zillow|Redfin|market))/i);
  if (dayCount !== null) return dayCount;
  return firstMatch(unstarred, /(NEW\s+\d+\s+(?:HRS?|DAYS?)\s+AGO)/i);
}
/**
 * Converts a days-on-market phrase to a day count.
 * "N days…" gives N; a phrase measured in hours counts as 0 days; anything else gives null.
 */
function normalizeDays(text: string | null): number | null {
  if (!text) return null;
  const dayHit = /(\d+)\s+days?/i.exec(text);
  if (dayHit !== null) return Number(dayHit[1]);
  // Listed less than a day ago.
  return /(\d+)\s+hrs?/i.test(text) ? 0 : null;
}
/** Finds a "Price cut: $X (…)" phrase or, failing that, a "Price reduced…" phrase. */
function extractPriceCutText(text: string): string | null {
  const cut = firstMatch(text, /(Price cut:?\s*\$?[\d,.]+[KMB]?(?:\s*\([^)]+\))?)/i);
  if (cut !== null) return cut;
  return firstMatch(text, /(Price reduced[^.\n]{0,120})/i);
}
/**
 * Collects up to 12 distinct lines that look like listing-history events
 * (price changes, listed/relisted, sold, pending, ...).
 *
 * Lines are cleaned of markdown markers first; empty lines, lines longer than 260
 * characters, bare "... history" headings and "Listed by ..." credits are skipped.
 *
 * @param text Multi-line page text; line breaks delimit the candidate events.
 */
function extractHistory(text: string): string[] {
  const sectionHeading = /^(?:-\s*)?(?:sale|tax|sale & tax) history$/i;
  const agentCredit = /^listed by\b/i;
  const historySignal = /\b(price cut|price changed|price change|listed|listed for rent|relisted|rental removed|sold|pending|contingent|listing updated|sale history|tax history)\b/i;

  // A Set drops repeats while keeping first-seen order.
  const events = new Set<string>();
  for (const rawLine of text.split(/\n+/)) {
    const line = cleanValue(rawLine.replace(/[*#|]/g, " "));
    if (line === "" || line.length > 260) continue;
    if (sectionHeading.test(line) || agentCredit.test(line)) continue;
    if (historySignal.test(line)) events.add(line);
  }
  return Array.from(events).slice(0, 12);
}
/**
 * Estimates gross rental yield as annual rent divided by asking price, rounded to
 * four decimals. Monthly rent (x12) is preferred; nightly rate (x365) is the fallback.
 *
 * @returns The yield as a fraction, or null when the price or both rates are missing
 *   or not positive.
 */
function computeGrossYield(price: number | null, monthlyRent: number | null, nightlyRate: number | null): number | null {
  const isPositive = (amount: number | null): amount is number => amount !== null && amount > 0;
  if (!isPositive(price)) return null;

  let annualIncome: number;
  if (isPositive(monthlyRent)) {
    annualIncome = monthlyRent * 12;
  } else if (isPositive(nightlyRate)) {
    annualIncome = nightlyRate * 365;
  } else {
    return null;
  }
  return round(annualIncome / price, 4);
}
/**
 * Assembles the row's free-text notes: which optional features were disabled, why the
 * yield is null, and the Airbnb occupancy caveat.
 *
 * @param history Whether history extraction was enabled.
 * @param rentalYield Whether yield calculation was enabled.
 * @returns The notes joined by single spaces, or null when there is nothing to say.
 */
function buildNotes(platform: Platform, history: boolean, rentalYield: boolean, grossYield: number | null): string | null {
  const remarks = [
    history ? "" : "Listing history extraction was disabled.",
    rentalYield ? "" : "Rental-yield calculation was disabled.",
    rentalYield && grossYield === null
      ? "Yield is null because public purchase price and rental-rate signals were not both visible."
      : "",
    platform === "airbnb"
      ? "Airbnb occupancy is not inferred; only visible nightly-rate, review, rating, fee, and host text is used."
      : "",
  ].filter((remark) => remark !== "");
  return remarks.length > 0 ? remarks.join(" ") : null;
}
/**
 * Scores how much the row can be trusted from what was extracted and how much page
 * markdown there was: "high" needs price, address and more than 500 characters;
 * "medium" needs either field and more than 100 characters; everything else is "low".
 */
function confidenceFor(priceText: string | null, address: string | null, markdown: string): "high" | "medium" | "low" {
  const hasPrice = Boolean(priceText);
  const hasAddress = Boolean(address);
  const size = markdown.length;

  if (hasPrice && hasAddress && size > 500) return "high";
  return (hasPrice || hasAddress) && size > 100 ? "medium" : "low";
}
/**
 * Renders the listing-relevant metadata entries as "key: value" lines so the text
 * extractors can see them. An entry is relevant when its key mentions title,
 * description, price, beds, baths, sqft, address, city, state, region or zestimate.
 */
function metadataText(metadata: Record<string, unknown>): string {
  const relevantKey = /title|description|price|beds|baths|sqft|address|city|state|region|zestimate/i;
  const lines: string[] = [];
  for (const [key, value] of Object.entries(metadata)) {
    if (relevantKey.test(key)) {
      lines.push(`${key}: ${String(value)}`);
    }
  }
  return lines.join("\n");
}
/** Returns the text content of an HTML document's body with script, style, noscript and svg elements removed. */
function htmlToText(html: string): string {
  if (html === "") return "";
  const dom = cheerio.load(html);
  dom("script,style,noscript,svg").remove();
  const bodyText = dom("body").text();
  return bodyText;
}
/** Reads a metadata value as a cleaned string; non-strings and blank strings give null. */
function stringMeta(metadata: Record<string, unknown>, key: string): string | null {
  const raw = metadata[key];
  if (typeof raw !== "string") return null;
  return raw.trim() === "" ? null : cleanValue(raw);
}
/**
 * Returns the first metadata value, among `keys` in order, that parses as a finite
 * number once thousands separators are removed; null when none does.
 */
function extractNumber(metadata: Record<string, unknown>, keys: string[]): number | null {
  for (const key of keys) {
    const raw = stringMeta(metadata, key);
    if (!raw) continue;
    const parsed = Number(raw.split(",").join(""));
    if (Number.isFinite(parsed)) return parsed;
  }
  return null;
}
/**
 * Runs `regex` against `text` and returns one capture group, cleaned.
 *
 * @param group Index of the capture group to return (defaults to the first).
 * @returns The cleaned capture, or null when there is no match or the group is empty.
 */
function firstMatch(text: string, regex: RegExp, group = 1): string | null {
  const captured = text.match(regex)?.[group];
  if (!captured) return null;
  return cleanValue(captured);
}
/**
 * Extracts the first capture of `regex` from `text` and parses it as a number,
 * ignoring thousands separators. Returns null when nothing matches or parses.
 */
function numberNear(text: string, regex: RegExp): number | null {
  const captured = firstMatch(text, regex);
  if (!captured) return null;
  const parsed = Number(captured.split(",").join(""));
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
/**
 * Converts the first monetary amount in `text` to a whole number.
 *
 * Thousands separators are ignored and a K / M / B magnitude suffix (any case) is
 * applied. The suffix only counts when it is a stand-alone letter: the first letter of
 * a following word ("$2,500 monthly", "$10,000 between ...") is not a multiplier.
 * The amount must begin with a digit, so stray punctuation ahead of it is skipped.
 *
 * @param text Text containing an amount such as "$450K", "$1.2M" or "$2,500/mo"; may be null.
 * @returns The rounded amount, or null when `text` is empty, holds no digits, or the
 *   digits do not form a valid number (for example "1.250.000").
 */
function normalizeMoney(text: string | null): number | null {
  if (!text) return null;
  // Group 1: digits with embedded , and . separators, starting and ending on a digit.
  // Group 2: optional magnitude letter, which must end at a word boundary.
  const match = text.match(/(\d[\d,.]*\d|\d)(?:\s*([KMB])\b)?/i);
  if (!match) return null;
  let value = Number((match[1] ?? "").replace(/,/g, ""));
  if (!Number.isFinite(value)) return null;
  const suffix = match[2]?.toUpperCase();
  if (suffix === "K") value *= 1_000;
  if (suffix === "M") value *= 1_000_000;
  if (suffix === "B") value *= 1_000_000_000;
  return Math.round(value);
}
/**
 * Guesses the ISO currency code from a price string by its symbol or code.
 * Checks run in the order USD, IDR, EUR, GBP; unrecognised or empty text gives null.
 */
function currencyFromText(text: string | null): string | null {
  if (!text) return null;
  const markers: Array<[RegExp, string]> = [
    [/\$|USD/i, "USD"],
    [/IDR/i, "IDR"],
    [/€|EUR/i, "EUR"],
    [/£|GBP/i, "GBP"],
  ];
  const hit = markers.find(([pattern]) => pattern.test(text));
  return hit === undefined ? null : hit[1];
}
/** Collapses every run of whitespace (including line breaks) to one space and trims the ends. */
function compactText(text: string): string {
  const trimmed = text.trim();
  return trimmed.replace(/\s+/g, " ");
}
/**
 * Normalizes an extracted value: collapses whitespace runs to single spaces, decodes
 * the HTML entity `&amp;` to a plain ampersand, and trims the ends.
 *
 * Only `&amp;` is decoded; other entities are left untouched.
 */
function cleanValue(text: string): string {
  return text.replace(/\s+/g, " ").replace(/&amp;/g, "&").trim();
}
/** Rounds `value` to `digits` decimal places using Math.round. */
function round(value: number, digits: number): number {
  const scale = Math.pow(10, digits);
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
/**
 * Runs the collector: discovers candidates, scrapes up to --max-listings distinct
 * listings one at a time, and writes a single JSON document to stdout. The document
 * holds the run parameters plus either `listings` or `grouped_by_platform`.
 */
async function main() {
  const collectedAt = new Date().toISOString();
  const candidates = await discoverCandidates();

  // Keep the first occurrence of each platform + URL, then cap at the requested count.
  const scrapedKeys = new Set<string>();
  const distinct = candidates.filter((candidate) => {
    const key = `${candidate.platform}:${candidate.url}`;
    if (scrapedKeys.has(key)) return false;
    scrapedKeys.add(key);
    return true;
  });

  const rows: ListingRow[] = [];
  for (const candidate of distinct.slice(0, maxListings)) {
    rows.push(await scrapeCandidate(candidate, collectedAt));
  }

  const base = {
    snapshot_label: snapshotLabel,
    collected_at: collectedAt,
    query,
    region: regionFilter,
    city: cityFilter,
    property_type: propertyTypeFilter,
    bedrooms: bedroomsFilter,
    price_range: priceRangeFilter,
    platforms,
    output_mode: outputMode,
    max_listings: maxListings,
    listing_count: rows.length,
  };

  if (outputMode !== "grouped_by_platform") {
    process.stdout.write(JSON.stringify({ ...base, listings: rows }));
    return;
  }

  // Every requested platform gets a bucket, even when it produced no rows.
  const grouped = {} as Record<Platform, ListingRow[]>;
  for (const platform of platforms) grouped[platform] = [];
  for (const row of rows) grouped[row.platform].push(row);
  process.stdout.write(JSON.stringify({ ...base, grouped_by_platform: grouped }));
}
// Entry point: any unhandled failure is logged and turned into a non-zero exit status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.