Contact & Email Finder

Name: Contact & Email Finder Data Collector | Firecrawl Prometheus
Creator: sideguide
Published: 2026-06-13T21:50:42.152Z
License: https://opensource.org/licenses/MIT

v1 · Published

Extract public contact details from any company website — emails, phone numbers, and social profile links. Parameter: url.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data

url	https://www.squarespace.com
counts
emails	[]
phones	[]
social
resolvedUrl	https://www.squarespace.com/
scrapedPages

Parameters

--url (string, required): The company website domain or URL to scrape for contact details (e.g. example.com or https://www.example.com). Example: "https://www.squarespace.com"

Marketplace

Publish this collector so others can deploy it — you keep ownership.

4 subscribers

sideguide (@sideguide)

0 runs in 14d · published 6w ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author

v1 · built · approved · current · 6w ago

How this script collects data

import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";

// ---------------------------------------------------------------------------
// Command-line input
// ---------------------------------------------------------------------------
// --url is the only declared option; parsing runs in strict mode.
const { values } = parseArgs({
  options: { url: { type: "string" } },
  strict: true,
});

// An absent or whitespace-only --url is treated as missing.
const rawInput = typeof values.url === "string" ? values.url.trim() : "";
if (rawInput.length === 0) {
  console.error("Missing required parameter --url=<domain-or-url>");
  process.exit(1);
}

/**
 * Normalise user input (a bare domain or a URL) into an absolute http(s) URL.
 *
 * Input without an http(s) scheme gets "https://" prepended after any leading
 * slashes are dropped, so "example.com" and "//example.com" both work.
 * Input that carries some other explicit scheme ("ftp://…") is rejected up
 * front; blindly prefixing it would turn the scheme name into the hostname
 * and produce a misleading error.
 *
 * @param input Domain or URL as supplied by the caller.
 * @returns The serialised absolute URL.
 * @throws Error whose message starts with "OUT_OF_SCOPE:" when the input uses
 *   a non-http(s) scheme, cannot be parsed, or has a hostname without a dot.
 */
function normalizeUrl(input: string): string {
  // "<scheme>://" where the scheme is anything other than http or https.
  const hasForeignScheme = /^(?!https?:\/\/)[a-z][a-z0-9+.\-]*:\/\//i.test(input);
  if (hasForeignScheme) {
    throw new Error(`OUT_OF_SCOPE: only http(s) URLs are supported, got "${input}"`);
  }

  let candidate = input;
  if (!/^https?:\/\//i.test(candidate)) {
    candidate = "https://" + candidate.replace(/^\/+/, "");
  }

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    throw new Error(`OUT_OF_SCOPE: "${input}" is not a valid domain or URL`);
  }

  // A bare hostname must contain a dot (e.g. example.com) to be a real site.
  if (!parsed.hostname.includes(".")) {
    throw new Error(`OUT_OF_SCOPE: "${input}" does not look like a public website domain`);
  }
  return parsed.toString();
}

// Firecrawl client; the API key is read from the FIRECRAWL_API_KEY environment variable.
const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });

// Finds email-shaped substrings (local@domain.tld, TLD of two or more letters) anywhere in a text.
const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
// Matches strings that end in a static-asset file extension (case-insensitive).
const ASSET_EXT_RE = /\.(png|jpe?g|gif|svg|webp|ico|css|js|woff2?|ttf|eot|mp4|pdf)$/i;

// Maps each supported platform to a hostname pattern. Each pattern accepts the
// bare domain or any subdomain of it (the `(^|\.)` prefix), case-insensitively.
const SOCIAL_MATCHERS: { platform: keyof SocialLinks; host: RegExp }[] = [
  { platform: "linkedin", host: /(^|\.)linkedin\.com$/i },
  { platform: "twitter", host: /(^|\.)(twitter|x)\.com$/i },
  { platform: "facebook", host: /(^|\.)(facebook|fb)\.com$/i },
  { platform: "instagram", host: /(^|\.)instagram\.com$/i },
];

// Generic / non-profile social paths we never want to report as a profile
// (share dialogs, tracking pixels, login pages, hashtag and search pages).
const SOCIAL_JUNK_RE = /\/(sharer|share|intent|dialog|plugins|tr|home|login|signup|hashtag|search)\b/i;

/** Social profile URLs grouped by platform; the keys double as the platform identifiers. */
interface SocialLinks {
  linkedin: string[];
  twitter: string[];
  facebook: string[];
  instagram: string[];
}

/**
 * Tidy an email candidate and decide whether it is worth reporting.
 *
 * The candidate is trimmed, stripped of trailing punctuation and lower-cased.
 *
 * @param raw Candidate address taken from a mailto link or page text.
 * @returns The cleaned address, or null when it is malformed, ends in an
 *   asset extension, contains "..", or looks like a placeholder / tracker.
 */
function cleanEmail(raw: string): string | null {
  const withoutTrailingPunctuation = raw.trim().replace(/[).,;:'"<>]+$/, "");
  const email = withoutTrailingPunctuation.toLowerCase();

  const wellFormed = /^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$/.test(email);
  const looksLikeAsset = ASSET_EXT_RE.test(email);
  const hasDoubleDot = email.includes("..");
  // Obvious placeholders / tracking noise.
  const isNoise = /(example|sentry|wixpress|\.png|\.jpg|domain)\b/i.test(email);

  return wellFormed && !looksLikeAsset && !hasDoubleDot && !isNoise ? email : null;
}

/**
 * Reduce a phone candidate to its dialable form.
 *
 * @param raw Candidate as found in a tel: link or in page text.
 * @returns The digits only, prefixed with "+" when the candidate started with
 *   one (leading whitespace and "(" ignored), or null when the digit count is
 *   outside 7–15.
 */
function cleanPhone(raw: string): string | null {
  const candidate = raw.trim();
  const digitsOnly = candidate.replace(/[^\d]/g, "");

  const plausibleLength = digitsOnly.length >= 7 && digitsOnly.length <= 15;
  if (!plausibleLength) return null;

  // A "+" counts as leading even when wrapped like "(+44)".
  const international = candidate.replace(/^[\s(]*/, "").startsWith("+");
  return international ? `+${digitsOnly}` : digitsOnly;
}

/**
 * Lower-cased hostname of an absolute URL.
 *
 * @param link URL string to inspect.
 * @returns The hostname, or null when `link` cannot be parsed as a URL.
 */
function hostnameOf(link: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(link);
  } catch {
    return null;
  }
  return parsed.hostname.toLowerCase();
}

/** Mutable, de-duplicating store for the contact signals gathered across scraped pages. */
interface Accumulator {
  // Cleaned email addresses.
  emails: Set<string>;
  phones: Map<string, string>; // normalized -> display
  // Profile URLs per platform.
  social: Record<keyof SocialLinks, Set<string>>;
}

/**
 * Record `link` in the accumulator if it is a profile URL on a known platform.
 *
 * Links are ignored when they cannot be parsed, match the junk-path pattern,
 * are not hosted on a supported platform, or have no path beyond the bare
 * domain. Accepted links are stored without fragment or trailing slashes.
 *
 * @param link Absolute URL candidate.
 * @param acc Accumulator whose `social` sets are updated in place.
 */
function classifySocial(link: string, acc: Accumulator) {
  let parsed: URL;
  try {
    parsed = new URL(link);
  } catch {
    return;
  }
  if (SOCIAL_JUNK_RE.test(link)) return;

  const host = parsed.hostname.toLowerCase();
  const matcher = SOCIAL_MATCHERS.find((candidate) => candidate.host.test(host));
  if (!matcher) return;

  // A profile needs at least one path segment beyond the bare domain.
  const hasProfilePath = parsed.pathname.replace(/\/+$/, "").length > 0;
  if (!hasProfilePath) return;

  const withoutFragment = link.split("#")[0];
  acc.social[matcher.platform].add(withoutFragment.replace(/\/+$/, ""));
}

/** Percent-decodes a URI component; malformed escapes leave the input unchanged. */
function decodeUriPart(value: string): string {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Extract all contact signals from one scraped document into the accumulator.
 *
 * Sources, in order: anchor hrefs (mailto:, tel:, http(s) social links), the
 * discovered link list, then the visible body text (emails and phone numbers).
 *
 * @param rawHtml HTML of the scraped page.
 * @param links Links discovered on the page; checked for social profiles.
 * @param acc Accumulator updated in place.
 */
function harvest(rawHtml: string, links: string[], acc: Accumulator) {
  const $ = cheerio.load(rawHtml);

  // Anchor hrefs: mailto, tel, and social profiles.
  $("a[href]").each((_, el) => {
    const href = ($(el).attr("href") || "").trim();
    if (/^mailto:/i.test(href)) {
      // Drop the "?subject=…" part, then handle every recipient: a mailto URI
      // may list several comma-separated addresses, each possibly
      // percent-encoded (e.g. "%40" for "@").
      const recipients = href.replace(/^mailto:/i, "").split("?")[0];
      for (const recipient of recipients.split(",")) {
        const c = cleanEmail(decodeUriPart(recipient));
        if (c) acc.emails.add(c);
      }
    } else if (/^tel:/i.test(href)) {
      // Parameters such as ";ext=123" follow the number after a ";" and must
      // not contribute digits; the number itself may be percent-encoded.
      const subscriber = href.replace(/^tel:/i, "").split(";")[0];
      const c = cleanPhone(decodeUriPart(subscriber));
      if (c) acc.phones.set(c, c);
    } else if (/^https?:/i.test(href)) {
      classifySocial(href, acc);
    }
  });

  // Social profiles surfaced in Firecrawl's discovered link list.
  for (const link of links) classifySocial(link, acc);

  // Visible text: emails and phone numbers written out in the page body.
  // NOTE(review): text() appears to join neighbouring elements' text with no
  // separator, so an address ending one element could fuse with the next
  // element's first word — confirm against cheerio's text() semantics.
  $("script, style, noscript").remove();
  const text = $("body").text().replace(/\s+/g, " ");

  const emailMatches = text.match(EMAIL_RE) || [];
  for (const m of emailMatches) {
    const c = cleanEmail(m);
    if (c) acc.emails.add(c);
  }

  // Phone numbers in text. To avoid false positives (years, prices, IDs that
  // are merely space-separated digit runs), a text candidate must carry an
  // explicit telephone separator: a leading "+", parentheses, or dot/dash
  // separated digit groups.
  const phoneRe =
    /(?:\+\d[\d\s().\-]{6,}\d)|(?:\(\d{2,4}\)[\d\s().\-]{4,}\d)|(?:\d{2,4}[.\-]\d{2,4}[.\-]\d{2,4}(?:[.\-]\d{1,4})?)/g;
  const phoneMatches = text.match(phoneRe) || [];
  for (const m of phoneMatches) {
    const c = cleanPhone(m);
    // Numbers already captured from tel: links take precedence.
    if (c && !acc.phones.has(c)) acc.phones.set(c, c);
  }
}

/**
 * Scrape one URL through Firecrawl, requesting the raw HTML and the link list.
 *
 * Best-effort: a failed scrape is logged to stderr and reported as null so the
 * caller can decide whether the page was essential.
 *
 * @param url Absolute URL to scrape.
 * @returns The page's raw HTML ("" when absent) and links ([] when absent or
 *   not an array), or null when the scrape threw.
 */
async function scrapePage(url: string): Promise<{ rawHtml: string; links: string[] } | null> {
  try {
    const doc: any = await firecrawl.scrape(url, {
      formats: ["rawHtml", "links"],
      onlyMainContent: false,
      integration: "prometheus",
    } as any);
    return { rawHtml: doc?.rawHtml || "", links: Array.isArray(doc?.links) ? doc.links : [] };
  } catch (e) {
    // Thrown values are not guaranteed to be Error instances; narrow before
    // reading .message so the log never prints "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Failed to scrape ${url}: ${reason}`);
    return null;
  }
}

/**
 * Run the collector: scrape the homepage, follow up to four same-site
 * contact/about style pages, and write the aggregated result to stdout as a
 * single JSON document.
 *
 * @throws Error when the input URL is rejected or the homepage cannot be scraped.
 */
async function main() {
  const startUrl = normalizeUrl(rawInput);

  // Sites are compared with a leading "www." ignored, so entering
  // "example.com" still follows links that point at "www.example.com".
  const siteKey = (host: string) => host.toLowerCase().replace(/^www\./, "");
  const startSite = siteKey(new URL(startUrl).hostname);

  const acc: Accumulator = {
    emails: new Set(),
    phones: new Map(),
    social: { linkedin: new Set(), twitter: new Set(), facebook: new Set(), instagram: new Set() },
  };

  const home = await scrapePage(startUrl);
  if (!home) {
    throw new Error(`could not scrape the homepage at ${startUrl}`);
  }
  const scrapedPages: string[] = [startUrl];
  harvest(home.rawHtml, home.links, acc);

  // Find up to 4 same-site contact/about pages to scrape for more details.
  const CONTACT_RE = /(contact|about|team|imprint|impressum|legal|support|company)/i;
  const MAX_CONTACT_PAGES = 4;
  const candidates: string[] = [];
  const seen = new Set([startUrl.replace(/\/+$/, "")]);
  for (const link of home.links) {
    if (candidates.length >= MAX_CONTACT_PAGES) break;

    let parsed: URL;
    try {
      parsed = new URL(link);
    } catch {
      continue;
    }
    if (siteKey(parsed.hostname) !== startSite) continue;

    // Match keywords against path and query only: testing the whole URL would
    // flag every link on a host such as "teamwork.com" or "mycompany.com".
    if (!CONTACT_RE.test(parsed.pathname + parsed.search)) continue;

    const norm = link.split("#")[0].replace(/\/+$/, "");
    if (seen.has(norm)) continue;
    seen.add(norm);
    candidates.push(link);
  }

  // Pages are scraped one at a time; a page that fails is simply skipped.
  for (const url of candidates) {
    const page = await scrapePage(url);
    if (!page) continue;
    scrapedPages.push(url);
    harvest(page.rawHtml, page.links, acc);
  }

  const out = {
    url: rawInput,
    resolvedUrl: startUrl,
    scrapedPages,
    emails: [...acc.emails].sort(),
    phones: [...acc.phones.values()].sort(),
    social: {
      linkedin: [...acc.social.linkedin].sort(),
      twitter: [...acc.social.twitter].sort(),
      facebook: [...acc.social.facebook].sort(),
      instagram: [...acc.social.instagram].sort(),
    },
    counts: {
      emails: acc.emails.size,
      phones: acc.phones.size,
      social:
        acc.social.linkedin.size +
        acc.social.twitter.size +
        acc.social.facebook.size +
        acc.social.instagram.size,
    },
  };

  process.stdout.write(JSON.stringify(out));
}

// Entry point: report any failure's message on stderr and exit non-zero.
main().catch((err: unknown) => {
  const message = err instanceof Error ? err.message : String(err);
  console.error(message);
  process.exit(1);
});

deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.