Competitor Integration Gap Finder collector facts

Publisher: bo-05 (@bo-05).

Version: 1. Last updated: 2026-07-03T09:26:07.674Z.

Run this collector on demand, as an API endpoint, or on a schedule with Firecrawl Prometheus.

Sample fields: notes, confidence, company_name, company_domain, source_page_url, integration_name, integration_category, integration_detail_url.

Parameters: seed-urls (string, required), max-companies (number), output-mode (string).

Competitor Integration Gap Finder

v1 · Published

Official company integration listings normalized into comparable company-integration rows or grouped company records.

Output & API

Preview the latest data, download it, or call this collector as an API.

Author's sample data
# | notes | confidence | company_name | company_domain | source_page_url | integration_name | integration_category | integration_detail_url
0 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Google Sheets | null | https://zapier.com/apps/google-sheets/integrations
1 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Gmail | null | https://zapier.com/apps/gmail/integrations
2 | Official page listing; no category visible. | high | Zapier | zapier.com | https://zapier.com/apps | Slack | null | https://zapier.com/apps/slack/integrations
Parameters
--seed-urls (string, required): Comma-separated company homepages or official integration/app/marketplace/partner page URLs to inspect, e.g. "https://zapier.com/apps,https://slack.com/apps".
--max-companies (number): Maximum number of companies from the seed URL list to process. The collector accepts 1 through 10 companies per run to keep request volume bounded. Default: 5.
--output-mode (string): Output shape: `integration_rows` for one row per company-integration pair, or `grouped_by_company` for nested integrations by company. Default: "integration_rows".

Marketplace

Publish this collector so others can deploy it — you keep ownership.

0 subscribers
bo-05 (@bo-05)
0 runs in 14d · published 2d ago

Versions

Every build and self-heal appends a version. Pin one to lock runs to it.

managed by author
v1 · built · approved · current · 2d ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { parseArgs } from "node:util";

// The Firecrawl API key is read from the environment. When it is missing the
// script reports the problem on stderr and exits with status 1 before doing
// any work.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
  console.error("FIRECRAWL_API_KEY is not set");
  process.exit(1);
}

// Single Firecrawl client shared by the scrape and search calls in this script.
const firecrawl = new Firecrawl({ apiKey });

// Value used for --max-companies when the flag is not supplied.
const DEFAULT_MAX_COMPANIES = 5;
// Largest value accepted for --max-companies.
const MAX_COMPANIES = 10;
// Number of anchors inspected on the seed page while looking for candidate pages.
const MAX_DISCOVERY_LINKS_PER_SEED = 150;
// Result limit passed to the Firecrawl search call (and re-applied to its output).
const SEARCH_RESULT_LIMIT = 3;
// Number of candidate pages kept per company after sorting by priority.
const MAX_CANDIDATE_PAGES_PER_COMPANY = 4;
// Number of anchors examined on each candidate page during extraction.
const MAX_EXTRACTION_LINKS_PER_PAGE = 500;
// Cap on the integrations returned from a single extracted page.
const MAX_INTEGRATIONS_PER_COMPANY = 100;

// Command-line flags. With `strict` enabled, parseArgs rejects anything that is
// not declared in `options`.
const { values: flags } = parseArgs({
  options: {
    "seed-urls": { type: "string" },
    "max-companies": { type: "string" },
    "output-mode": { type: "string" },
  },
  strict: true,
});

// --seed-urls has no default, so its absence is a hard error.
if (!flags["seed-urls"]) {
  console.error("--seed-urls is required");
  process.exit(1);
}

// --max-companies is parsed as a string; fall back to the default and coerce
// to a number, then require an integer inside [1, MAX_COMPANIES].
const rawMaxCompanies = flags["max-companies"] ?? String(DEFAULT_MAX_COMPANIES);
const maxCompanies = Number(rawMaxCompanies);
if (!(Number.isInteger(maxCompanies) && maxCompanies >= 1 && maxCompanies <= MAX_COMPANIES)) {
  console.error(`OUT_OF_SCOPE: --max-companies must be an integer between 1 and ${MAX_COMPANIES}`);
  process.exit(1);
}

// --output-mode selects between flat rows (the default) and per-company groups.
const outputMode = flags["output-mode"] ?? "integration_rows";
if (!(outputMode === "integration_rows" || outputMode === "grouped_by_company")) {
  console.error("OUT_OF_SCOPE: --output-mode must be integration_rows or grouped_by_company");
  process.exit(1);
}

/**
 * One flat output row (`integration_rows` mode): a company paired with a single
 * integration. For a company with no extracted integrations, `main` emits one
 * placeholder row whose integration fields are `null` and whose confidence is
 * "low".
 */
type IntegrationRow = {
  company_name: string;
  company_domain: string;
  /** Page the integrations were taken from, or `null` when no page produced any. */
  source_page_url: string | null;
  integration_name: string | null;
  integration_category: string | null;
  integration_detail_url: string | null;
  confidence: "high" | "medium" | "low";
  notes: string;
};

/**
 * A single integration extracted from a listing page, before it is combined
 * with company information for output.
 */
type Integration = {
  integration_name: string;
  /** Category label found near the listing link, or `null` when none was found. */
  integration_category: string | null;
  integration_detail_url: string | null;
  confidence: "high" | "medium" | "low";
  notes: string;
};

// Split the comma-separated --seed-urls value, drop blank entries, and keep at
// most `maxCompanies` of them (extra seeds are ignored, in input order).
const seedUrls: string[] = [];
for (const piece of String(flags["seed-urls"]).split(",")) {
  const trimmed = piece.trim();
  if (trimmed && seedUrls.length < maxCompanies) {
    seedUrls.push(trimmed);
  }
}

if (seedUrls.length === 0) {
  console.error("OUT_OF_SCOPE: --seed-urls must include at least one URL");
  process.exit(1);
}

// Lower-case substrings that mark a URL or piece of text as integration-related.
const integrationPageTerms = [
  "integration", "integrations",
  "apps", "app",
  "marketplace",
  "partners", "partner",
  "connectors", "plugins", "extensions",
];

// Lower-case category labels recognised in page text. Order matters: lookups
// return the first term in this list that matches.
const categoryTerms = [
  "crm", "payments", "analytics", "support", "ecommerce", "automation",
  "productivity", "developer tools", "communication", "collaboration",
  "marketing", "sales", "finance", "data", "storage", "security",
  "hr", "project management",
];

// Lower-case link labels that are navigation or calls to action rather than
// integration names.
const genericNames = new Set([
  // listing / section labels
  "apps", "app", "integrations", "integration", "marketplace", "partners", "partner",
  // calls to action
  "learn more", "read more", "view all", "see all", "get started", "contact us", "request demo",
  // site navigation
  "pricing", "features", "solutions", "resources", "blog", "docs", "documentation", "api",
  // account links
  "login", "sign in", "sign up", "book a demo",
]);

// Lower-case path segments that disqualify a URL from being treated as an
// integration detail page.
const nonIntegrationPathParts = new Set([
  "category", "categories", "collection", "collections", "templates", "template",
  "blog", "resources", "resource", "customers", "customer",
  "pricing", "features", "feature", "solutions", "solution", "industries", "industry",
  "docs", "documentation", "developers", "developer",
  "login", "signin", "signup", "contact", "deals", "l",
]);

/**
 * Turns a user-supplied seed into a URL object.
 *
 * Input without an http(s) scheme is treated as https. The fragment is removed
 * from the returned URL.
 *
 * @param input Seed as typed by the user (full URL or bare domain/path).
 * @returns The parsed URL with an empty hash.
 * @throws Error with an `OUT_OF_SCOPE:` message when the input cannot be parsed
 *   or its hostname contains no dot.
 */
function normalizeSeedUrl(input: string): URL {
  const hasScheme = /^https?:\/\//i.test(input);
  const candidate = hasScheme ? input : `https://${input}`;

  const url = (() => {
    try {
      return new URL(candidate);
    } catch {
      throw new Error(`OUT_OF_SCOPE: invalid URL "${input}"`);
    }
  })();

  // A hostname without a dot (e.g. "localhost") is not accepted as a company domain.
  if (url.hostname.indexOf(".") === -1) {
    throw new Error(`OUT_OF_SCOPE: invalid company domain "${input}"`);
  }

  url.hash = "";
  return url;
}

/** Lower-cases a hostname and removes one leading "www." label, if present. */
function cleanDomain(hostname: string): string {
  const lowered = hostname.toLowerCase();
  return lowered.startsWith("www.") ? lowered.slice("www.".length) : lowered;
}

/**
 * Reports whether `candidate` is hosted on `domain` or one of its subdomains.
 *
 * @param candidate Absolute URL to test; an unparsable value yields `false`.
 * @param domain Domain to compare against, in the form produced by `cleanDomain`.
 */
function sameOfficialDomain(candidate: string, domain: string): boolean {
  let host: string;
  try {
    host = cleanDomain(new URL(candidate).hostname);
  } catch {
    return false;
  }
  if (host === domain) return true;
  return host.endsWith(`.${domain}`);
}

/**
 * Resolves an anchor's `href` against the page URL and returns it without a
 * fragment.
 *
 * Only http(s) results are returned. Fragment-only links, empty values, and
 * any other scheme (mailto:, tel:, javascript:, data:, ...) yield `null`, so
 * callers never receive a URL that cannot be scraped.
 *
 * @param href Raw attribute value; may be missing or padded with whitespace.
 * @param baseUrl URL of the page the link was found on.
 * @returns The absolute URL string, or `null` when the link is not usable.
 */
function absoluteUrl(href: string | undefined, baseUrl: string): string | null {
  // Trim first so that values like "  #section" are recognised as fragment links.
  const trimmed = href?.trim();
  if (!trimmed || trimmed.startsWith("#")) {
    return null;
  }
  try {
    const parsed = new URL(trimmed, baseUrl);
    // The parsed protocol is already lower-cased, so this also rejects "MAILTO:" etc.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      return null;
    }
    parsed.hash = "";
    return parsed.toString();
  } catch {
    return null;
  }
}

/**
 * Collapses every run of whitespace to a single space and strips leading and
 * trailing whitespace. `null` and `undefined` become the empty string.
 */
function compactText(value: string | undefined | null): string {
  if (value == null) return "";
  return value
    .split(/\s+/)
    .filter((piece) => piece !== "")
    .join(" ");
}

/**
 * Derives a display name from a domain: takes the text before the first dot,
 * splits it on "-" and "_", capitalises the first character of each piece and
 * joins the pieces with spaces.
 */
function titleCaseFromDomain(domain: string): string {
  const dotIndex = domain.indexOf(".");
  const label = dotIndex === -1 ? domain : domain.slice(0, dotIndex);

  const words: string[] = [];
  for (const word of label.split(/[-_]/)) {
    if (word) {
      words.push(word.charAt(0).toUpperCase() + word.slice(1));
    }
  }
  return words.join(" ");
}

/**
 * Reports whether `value` contains, case-insensitively and as a plain
 * substring, any entry of `integrationPageTerms`.
 */
function hasIntegrationSignal(value: string): boolean {
  const haystack = value.toLowerCase();
  for (const term of integrationPageTerms) {
    if (haystack.includes(term)) return true;
  }
  return false;
}

/**
 * Produces a comparison-friendly form of a URL: the fragment is removed and a
 * single trailing slash is dropped from any path other than "/".
 *
 * @throws TypeError when `value` is not a parsable absolute URL.
 */
function canonicalUrl(value: string): string {
  const url = new URL(value);
  url.hash = "";

  const { pathname } = url;
  const hasTrailingSlash = pathname.length > 1 && pathname.endsWith("/");
  if (hasTrailingSlash) {
    url.pathname = pathname.slice(0, -1);
  }
  return url.toString();
}

/**
 * Ranks a candidate page URL; lower numbers are tried first.
 *
 * 0 is reserved for a URL identical to the seed. Otherwise the first matching
 * path fragment decides (case-insensitive substring match), and 9 is returned
 * when nothing matches.
 */
function candidatePriority(url: string, seedUrl: string): number {
  if (url === seedUrl) return 0;

  const lowered = url.toLowerCase();
  const ranking: Array<[number, string[]]> = [
    [1, ["/integrations"]],
    [2, ["/apps", "/app-marketplace"]],
    [3, ["/marketplace"]],
    [4, ["/partners"]],
    [5, ["/connectors"]],
  ];
  for (const [priority, fragments] of ranking) {
    if (fragments.some((fragment) => lowered.includes(fragment))) {
      return priority;
    }
  }
  return 9;
}

/**
 * Fetches a page through Firecrawl and returns its HTML.
 *
 * Best-effort: a failed scrape is logged to stderr and reported as `null`, as
 * is a response without a string `html` field.
 */
async function scrapeHtml(url: string): Promise<string | null> {
  const request = { formats: ["html"], integration: "prometheus" };
  try {
    const doc: any = await firecrawl.scrape(url, request);
    if (typeof doc?.html === "string") {
      return doc.html;
    }
    return null;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Skipping ${url}: ${reason}`);
    return null;
  }
}

/**
 * Pulls the array of result items out of a Firecrawl search response.
 *
 * Accepts `{ data: [...] }`, `{ web: [...] }` and `{ data: { web: [...] } }`.
 * NOTE(review): the installed SDK version decides which shape is actually
 * returned; confirm and drop the unused branches if desired.
 */
function searchResultItems(response: unknown): unknown[] {
  if (!response || typeof response !== "object") return [];
  const { data, web } = response as { data?: unknown; web?: unknown };
  if (Array.isArray(data)) return data;
  if (Array.isArray(web)) return web;
  if (data && typeof data === "object") {
    const nestedWeb = (data as { web?: unknown }).web;
    if (Array.isArray(nestedWeb)) return nestedWeb;
  }
  return [];
}

/**
 * Builds the ordered list of pages worth scraping for one company.
 *
 * Candidates come from four sources, in this insertion order: the seed itself,
 * a fixed set of conventional paths on the seed's origin, links on the seed
 * page that carry an integration signal, and Firecrawl search results for the
 * domain. Only URLs on `domain` (or a subdomain) are kept; duplicates keep
 * their lowest priority number.
 *
 * @param seed Normalised seed URL.
 * @param domain Company domain in the form produced by `cleanDomain`.
 * @returns Up to MAX_CANDIDATE_PAGES_PER_COMPANY canonical URLs, best first.
 */
async function discoverCandidatePages(seed: URL, domain: string): Promise<string[]> {
  const seedUrl = canonicalUrl(seed.toString());
  // canonical URL -> best (lowest) priority seen so far. Insertion order is the
  // tie-breaker when priorities are equal, because the sort below is stable.
  const candidates = new Map<string, number>();

  function add(url: string | null, priority: number) {
    if (!url || !sameOfficialDomain(url, domain)) return;
    const canonical = canonicalUrl(url);
    const existing = candidates.get(canonical);
    if (existing === undefined || priority < existing) candidates.set(canonical, priority);
  }

  // A seed that does not look like an integrations page is kept only as a late fallback.
  add(seedUrl, hasIntegrationSignal(seed.pathname) ? 0 : 8);

  // Conventional locations, added without checking that they exist.
  for (const path of [
    "/integrations",
    "/apps",
    "/app-marketplace",
    "/marketplace",
    "/partners",
    "/connectors",
    "/product/integrations",
    "/solutions/integrations",
  ]) {
    add(`${seed.origin}${path}`, candidatePriority(`${seed.origin}${path}`, seedUrl));
  }

  const seedHtml = await scrapeHtml(seedUrl);
  if (seedHtml) {
    const $ = cheerio.load(seedHtml);
    $("a[href]").slice(0, MAX_DISCOVERY_LINKS_PER_SEED).each((_, element) => {
      const href = absoluteUrl($(element).attr("href"), seedUrl);
      const text = compactText($(element).text());
      const signal = `${href ?? ""} ${text}`;
      if (href && hasIntegrationSignal(signal)) add(href, candidatePriority(href, seedUrl));
    });
  }

  // Search is best-effort: a failure is logged and discovery continues without it.
  try {
    const searchResponse: unknown = await firecrawl.search(
      `site:${domain} official integrations apps marketplace partners`,
      { limit: SEARCH_RESULT_LIMIT, integration: "prometheus" },
    );
    const results = searchResultItems(searchResponse).slice(0, SEARCH_RESULT_LIMIT);
    for (const item of results) {
      if (!item || typeof item !== "object") continue;
      const result = item as { url?: unknown; title?: unknown; description?: unknown };
      const url = typeof result.url === "string" ? result.url : null;
      // Non-string fields are treated as empty rather than aborting the whole search step.
      const title = typeof result.title === "string" ? compactText(result.title) : "";
      const description = typeof result.description === "string" ? compactText(result.description) : "";
      if (url && sameOfficialDomain(url, domain) && hasIntegrationSignal(`${url} ${title} ${description}`)) {
        add(url, candidatePriority(url, seedUrl));
      }
    }
  } catch (err) {
    console.error(`Search failed for ${domain}: ${err instanceof Error ? err.message : String(err)}`);
  }

  return [...candidates.entries()]
    .sort((a, b) => a[1] - b[1])
    .slice(0, MAX_CANDIDATE_PAGES_PER_COMPANY)
    .map(([url]) => url);
}

/**
 * Maps free text to one of the known category labels.
 *
 * A term only matches as a whole word (or whole phrase), case-insensitively,
 * so short terms such as "hr", "data" or "sales" are not picked up inside
 * unrelated words like "Chrome", "update" or "Salesforce". The first matching
 * entry of `categoryTerms` wins.
 *
 * @param value Text to inspect (attribute value, tag text or heading).
 * @returns The matched term with each word capitalised, or `null`.
 */
function visibleCategory(value: string): string | null {
  const lower = value.toLowerCase();
  const found = categoryTerms.find((term) => {
    // Escape regex metacharacters so a term is always matched literally.
    const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return new RegExp(`\\b${escaped}\\b`).test(lower);
  });
  if (!found) return null;
  // NOTE(review): acronyms come out as "Crm" / "Hr"; left as-is to keep output values stable.
  return found
    .split(" ")
    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
    .join(" ");
}

/**
 * Looks for a category label around a listing link, trying in order:
 * 1. the `data-category` attribute of the closest ancestor matched below,
 * 2. text of a category/tag-classed element inside the closest container,
 * 3. a preceding sibling heading of that container, else the first heading of
 *    the enclosing `<section>`.
 *
 * @returns The recognised category, or `null` when none of the sources match.
 */
function nearestCategory($: cheerio.CheerioAPI, element: cheerio.Element): string | null {
  const link = $(element);

  // NOTE(review): the selector also stops at `[aria-label]` ancestors, but only
  // `data-category` is read, so such an ancestor yields no value here.
  const attributeValue = link.closest("[data-category], [aria-label]").attr("data-category");
  const fromAttribute = visibleCategory(compactText(attributeValue));
  if (fromAttribute) return fromAttribute;

  const card = link.closest("article, li, div, section");
  const labelText = card.find(".category, [class*='category'], [class*='tag']").first().text();
  const fromLabel = visibleCategory(compactText(labelText));
  if (fromLabel) return fromLabel;

  let headingText = card.prevAll("h2,h3,h4").first().text();
  if (!headingText) {
    headingText = card.closest("section").find("h2,h3,h4").first().text();
  }
  return visibleCategory(compactText(headingText));
}

/**
 * Heuristic filter deciding whether a piece of link text looks like the name
 * of an integration. The text is normalised first; the checks then reject
 * names that are too short or long, generic navigation labels, calls to
 * action, long all-caps strings, bare counts, markup remnants and long
 * phrases. A surviving name must contain at least one letter or digit.
 */
function likelyIntegrationName(text: string): boolean {
  const name = normalizeIntegrationName(text);

  const badLength = name.length < 2 || name.length > 70;
  if (badLength || genericNames.has(name.toLowerCase())) return false;

  const shouting = name.length > 8 && name === name.toUpperCase();
  const wordCount = name.split(" ").length;
  if (shouting || wordCount > 7) return false;

  const rejectPatterns = [
    /^(view|see|learn|read|explore|browse|connect|install|try|become|submit|add)\b/i,
    /^\d[\d,]*\+?$/,
    /[{}<>]/,
  ];
  if (rejectPatterns.some((pattern) => pattern.test(name))) return false;

  return /[a-z0-9]/i.test(name);
}

/**
 * Cleans link/alt text down to a bare integration name.
 *
 * Steps, in order: collapse whitespace, drop a leading "logo of", drop a
 * trailing count badge, drop a trailing "logo", drop a trailing
 * "integration(s)" / "app(s)" word.
 *
 * A trailing number is treated as a count badge only when it is separated
 * from the name by whitespace ("Slack 1,234+"), or when it is fused to the
 * name but clearly formatted as a count (ends in "+" or uses thousands
 * separators, e.g. "Slack1,234"). Digits that are simply the end of a token
 * are kept, so names such as "Auth0" or "Amazon S3" survive intact.
 *
 * NOTE(review): a whitespace-separated number that is part of the name
 * ("Office 365") is still removed, as before; the two cases cannot be told
 * apart from the text alone.
 */
function normalizeIntegrationName(text: string): string {
  let name = compactText(text);
  name = name.replace(/^logo of\s+/i, "").trim();
  name = name.replace(/(?:\s+\d[\d,]*\+?|\d[\d,]*\+|\d{1,3}(?:,\d{3})+)$/, "").trim();
  name = name.replace(/\s+logo$/i, "").trim();
  name = name.replace(/\s+(integration|integrations|app|apps)$/i, "").trim();
  return name;
}

/**
 * Heuristic deciding whether `href` points at a single integration's detail
 * page, given the listing page it was found on.
 *
 * Rejected outright: the listing page itself, the site root, the bare
 * /apps, /integrations and /marketplace listings, and any path containing a
 * segment from `nonIntegrationPathParts`. The first path segment then selects
 * the rule that must hold.
 *
 * @throws TypeError when either argument is not a parsable absolute URL.
 */
function likelyDetailUrl(href: string, pageUrl: string): boolean {
  const target = new URL(href);
  const listing = new URL(pageUrl);
  if (canonicalUrl(href) === canonicalUrl(pageUrl)) return false;

  const path = target.pathname.toLowerCase();
  const bareListings = ["/apps", "/integrations", "/marketplace"];
  if (target.pathname === "/" || bareListings.includes(path)) return false;

  const segments = path.split("/").filter((segment) => segment.length > 0);
  for (const segment of segments) {
    if (nonIntegrationPathParts.has(segment)) return false;
  }

  switch (segments[0]) {
    case "apps":
      // e.g. /apps/<name>/integrations
      return segments.length >= 3 && segments.includes("integrations");
    case "marketplace":
      // Second segment must start with "a", more alphanumerics, then a hyphen.
      return segments.length >= 2 && /^a[a-z0-9]+-/i.test(segments[1]);
    case "integrations":
    case "integration":
    case "connectors":
    case "connector":
    case "partners":
    case "partner":
      return segments.length >= 2;
    default:
      // Anything else needs an integration signal in both paths and some depth.
      return (
        hasIntegrationSignal(path) &&
        hasIntegrationSignal(listing.pathname.toLowerCase()) &&
        segments.length >= 2
      );
  }
}

/**
 * Extracts integration listings from one page's HTML.
 *
 * Examines up to MAX_EXTRACTION_LINKS_PER_PAGE anchors. A link is kept when it
 * stays on `domain`, looks like a detail URL, yields a plausible name (image
 * alt, then aria-label, then link text) and some integration signal exists on
 * the page, in the link path, or in the surrounding container text. Results
 * are de-duplicated by lower-cased name; a "high" confidence entry replaces an
 * earlier "medium" one.
 *
 * @param html Page HTML.
 * @param pageUrl URL the HTML was fetched from; used to resolve relative links.
 * @param domain Company domain in the form produced by `cleanDomain`.
 * @returns At most MAX_INTEGRATIONS_PER_COMPANY integrations, in first-seen order.
 */
function extractFromPage(html: string, pageUrl: string, domain: string): Integration[] {
  const $ = cheerio.load(html);
  const pageHasSignal = hasIntegrationSignal(pageUrl) || hasIntegrationSignal(compactText($("title").text()));
  const byName = new Map<string, Integration>();

  $("a[href]").slice(0, MAX_EXTRACTION_LINKS_PER_PAGE).each((_, element) => {
    const href = absoluteUrl($(element).attr("href"), pageUrl);
    if (!href || !sameOfficialDomain(href, domain)) return;
    if (!likelyDetailUrl(href, pageUrl)) return;

    const linkText = compactText($(element).text());
    const imageAlt = compactText($(element).find("img[alt]").first().attr("alt"));
    const ariaLabel = compactText($(element).attr("aria-label"));
    const rawName = [imageAlt, ariaLabel, linkText].find((candidate) => likelyIntegrationName(candidate));
    if (!rawName) return;
    const candidateName = normalizeIntegrationName(rawName);
    if (!candidateName) return;

    const pathHasSignal = hasIntegrationSignal(new URL(href).pathname);
    // The container text can be large, so it is only read when neither the
    // page nor the link path already provides a signal.
    if (!pageHasSignal && !pathHasSignal) {
      const containerText = compactText($(element).closest("article, li, div").text());
      if (!hasIntegrationSignal(containerText)) return;
    }

    const confidence: "high" | "medium" = pathHasSignal || pageHasSignal ? "high" : "medium";
    const key = candidateName.toLowerCase();
    const existing = byName.get(key);
    // Keep the first entry per name unless this one upgrades medium -> high.
    if (existing && !(existing.confidence === "medium" && confidence === "high")) return;

    // Category lookup walks the DOM, so it is done only for entries that are stored.
    const category = nearestCategory($, element);
    byName.set(key, {
      integration_name: candidateName,
      integration_category: category,
      integration_detail_url: href,
      confidence,
      notes: category ? "Official page listing with visible category signal." : "Official page listing; no category visible.",
    });
  });

  return [...byName.values()].slice(0, MAX_INTEGRATIONS_PER_COMPANY);
}

/**
 * Collects integrations for one seed.
 *
 * Candidate pages are scraped one at a time, in priority order. The page that
 * yields the most integrations is kept as the source. Scanning stops early
 * once the best result holds at least 10 integrations and the page just
 * processed carries an integration signal in its URL.
 *
 * @param seedUrlText Seed as supplied on the command line.
 * @returns Company name (derived from the domain), domain, the winning page
 *   URL (or `null`) and its integrations (possibly empty).
 * @throws Error with an `OUT_OF_SCOPE:` message when the seed is invalid.
 */
async function collectCompany(seedUrlText: string): Promise<{ companyName: string; domain: string; sourcePageUrl: string | null; integrations: Integration[] }> {
  const seed = normalizeSeedUrl(seedUrlText);
  const domain = cleanDomain(seed.hostname);
  const pages = await discoverCandidatePages(seed, domain);

  const best: { source: string | null; integrations: Integration[] } = {
    source: null,
    integrations: [],
  };

  for (const page of pages) {
    const html = await scrapeHtml(page);
    if (!html) continue;

    const found = extractFromPage(html, page, domain);
    if (found.length > best.integrations.length) {
      best.integrations = found;
      best.source = page;
    }

    const enough = best.integrations.length >= 10;
    if (enough && hasIntegrationSignal(page)) break;
  }

  return {
    companyName: titleCaseFromDomain(domain),
    domain,
    sourcePageUrl: best.source,
    integrations: best.integrations,
  };
}

/**
 * Runs the collector: validates every seed, gathers integrations for each
 * company in turn, and writes the result as a single JSON document to stdout.
 *
 * In `grouped_by_company` mode the output is one object per company with a
 * nested `integrations` array. Otherwise it is one flat row per
 * company-integration pair, with a single placeholder row (confidence "low")
 * for a company that produced no integrations.
 *
 * @throws Error with an `OUT_OF_SCOPE:` message when any seed is invalid.
 */
async function main() {
  // Validate all seeds before any network call, so a bad seed later in the
  // list does not discard scraping work already done for earlier ones.
  for (const seedUrl of seedUrls) {
    normalizeSeedUrl(seedUrl);
  }

  // Companies are processed one after another rather than in parallel.
  const companies: Awaited<ReturnType<typeof collectCompany>>[] = [];
  for (const seedUrl of seedUrls) {
    companies.push(await collectCompany(seedUrl));
  }

  if (outputMode === "grouped_by_company") {
    const grouped = companies.map((company) => ({
      company_name: company.companyName,
      company_domain: company.domain,
      source_page_url: company.sourcePageUrl,
      integrations: company.integrations.map((integration) => ({
        integration_name: integration.integration_name,
        integration_category: integration.integration_category,
        integration_detail_url: integration.integration_detail_url,
        confidence: integration.confidence,
        notes: integration.notes,
      })),
    }));
    process.stdout.write(JSON.stringify(grouped));
    return;
  }

  const rows: IntegrationRow[] = companies.flatMap((company) => {
    if (company.integrations.length === 0) {
      // Keep the company visible in the flat output even when nothing was extracted.
      return [
        {
          company_name: company.companyName,
          company_domain: company.domain,
          source_page_url: company.sourcePageUrl,
          integration_name: null,
          integration_category: null,
          integration_detail_url: null,
          confidence: "low",
          notes: "No public official integrations page with extractable integration listings was found.",
        },
      ];
    }

    return company.integrations.map((integration) => ({
      company_name: company.companyName,
      company_domain: company.domain,
      source_page_url: company.sourcePageUrl,
      integration_name: integration.integration_name,
      integration_category: integration.integration_category,
      integration_detail_url: integration.integration_detail_url,
      confidence: integration.confidence,
      notes: integration.notes,
    }));
  });

  process.stdout.write(JSON.stringify(rows));
}

// Entry point: any failure escaping main() is printed to stderr and the
// process exits with status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
deploy to unlock

Deploy this collector to unlock schedules, the API endpoint, and destinations.

One person builds it. Everyone keeps it fresh.