diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment-task.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment-task.ts index 78210a5138..23abf82887 100644 --- a/apps/api/src/trigger/vendor/vendor-risk-assessment-task.ts +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment-task.ts @@ -476,7 +476,9 @@ export const vendorRiskAssessmentTask: Task< minTimeoutInMs: 1000, maxTimeoutInMs: 10000, }, - maxDuration: 1000 * 60 * 10, + // 30 minutes total: Firecrawl Agent can take up to 25 min on slow SPA + // trust centers (Ubiquiti), and deep-scrape + DB writes need room too. + maxDuration: 1000 * 60 * 30, run: async (payload) => { await tags.add([`org:${payload.organizationId}`]); @@ -1061,7 +1063,19 @@ export const vendorRiskAssessmentTask: Task< badgeCount: Array.isArray(complianceBadges) ? complianceBadges.length : 0, + complianceBadgesJson: JSON.stringify(complianceBadges ?? null), hasLogo: Boolean(logoUrl), + certificationsInAssessmentJson: JSON.stringify( + Array.isArray( + (coreData as { certifications?: unknown })?.certifications, + ) + ? 
import type { VendorRiskAssessmentCertification } from './agent-types';
import { pickDeepScrapeSourceUrl } from './deep-scrape-source-url';

/**
 * Certification fixture. Defaults to a verified "SOC 2 Type II" entry with no
 * dates and no URL; each test overrides only the fields it cares about.
 */
const cert = (
  overrides: Partial<VendorRiskAssessmentCertification> = {},
): VendorRiskAssessmentCertification => ({
  type: 'SOC 2 Type II',
  status: 'verified',
  issuedAt: null,
  expiresAt: null,
  url: null,
  ...overrides,
});

describe('pickDeepScrapeSourceUrl', () => {
  const vendorDomain = 'acme.com';

  // --- Tier 1 / tier 2: labelled links -----------------------------------

  it("prefers 'Trust & Security' link over 'Security Overview'", () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [
        { label: 'Security Overview', url: 'https://acme.com/security' },
        { label: 'Trust & Security', url: 'https://acme.com/trust' },
      ],
      certifications: [],
    });
    expect(result).toBe('https://acme.com/trust');
  });

  it("falls back to 'Security Overview' when no 'Trust & Security' link", () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [{ label: 'Security Overview', url: 'https://acme.com/security' }],
      certifications: [],
    });
    expect(result).toBe('https://acme.com/security');
  });

  // --- Tier 3: verified certification URLs -------------------------------

  it('falls back to a verified cert URL on the vendor domain when no labelled links match', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({ url: 'https://acme.com/reports/soc2.pdf', status: 'verified' }),
      ],
    });
    expect(result).toBe('https://acme.com/reports/soc2.pdf');
  });

  it('skips subdomain-matching cert URL when status is not verified', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({ url: 'https://trust.acme.com/iso', status: 'unknown' }),
      ],
    });
    expect(result).toBeNull();
  });

  it('accepts subdomain-matching cert URL (same registrable domain)', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({ url: 'https://trust.acme.com/iso', status: 'verified' }),
      ],
    });
    expect(result).toBe('https://trust.acme.com/iso');
  });

  // --- Domain gate --------------------------------------------------------

  it('rejects off-domain labelled links', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [
        { label: 'Trust & Security', url: 'https://acme.trust.page' },
      ],
      certifications: [],
    });
    expect(result).toBeNull();
  });

  it('rejects off-domain verified cert URL', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({ url: 'https://acme.safebase.io/soc2', status: 'verified' }),
      ],
    });
    expect(result).toBeNull();
  });

  it('rejects unparseable URLs', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [{ label: 'Trust & Security', url: 'not a url' }],
      certifications: [cert({ url: 'also not a url', status: 'verified' })],
    });
    expect(result).toBeNull();
  });

  it('returns null when everything is empty', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [],
    });
    expect(result).toBeNull();
  });

  // --- Ordering among certifications -------------------------------------

  it('returns first verified cert URL and ignores later verified certs', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({
          type: 'SOC 2',
          status: 'verified',
          url: 'https://acme.com/first.pdf',
        }),
        cert({
          type: 'ISO 27001',
          status: 'verified',
          url: 'https://acme.com/second.pdf',
        }),
      ],
    });
    expect(result).toBe('https://acme.com/first.pdf');
  });

  it('skips verified certs whose URL is null and continues to next cert', () => {
    const result = pickDeepScrapeSourceUrl({
      vendorDomain,
      links: [],
      certifications: [
        cert({ type: 'SOC 2', status: 'verified', url: null }),
        cert({
          type: 'ISO 27001',
          status: 'verified',
          url: 'https://acme.com/iso.pdf',
        }),
      ],
    });
    expect(result).toBe('https://acme.com/iso.pdf');
  });
});
/**
 * Resolve the best "source URL" to feed into `deepScrapeTrustPortal`.
 *
 * Fallback order:
 *   1. The link labelled "Trust & Security", if it is on the vendor's domain.
 *   2. The link labelled "Security Overview", if it is on the vendor's domain.
 *   3. The URL of the first verified certification (in array order) that is
 *      on the vendor's domain.
 *
 * A URL is "on the vendor's domain" when it parses as an http(s) URL whose
 * hostname equals `vendorDomain` or is a subdomain of it. The comparison is
 * case-insensitive on both sides. Off-domain, non-http(s) and unparseable
 * URLs are rejected at every tier, so this helper acts as a first gate
 * against handing an unrelated host to the scraper.
 *
 * @param args.vendorDomain   Vendor host name, e.g. `acme.com` (any casing,
 *                            surrounding whitespace ignored).
 * @param args.links          Labelled links returned by the agent.
 * @param args.certifications Certifications returned by the agent; only
 *                            entries with `status === 'verified'` and a
 *                            non-empty `url` are considered.
 * @returns The selected URL exactly as supplied, or `null` if nothing
 *          qualifies (including when `vendorDomain` is empty).
 */
export function pickDeepScrapeSourceUrl(args: {
  vendorDomain: string;
  links: Array<{ label: string; url: string }>;
  certifications: VendorRiskAssessmentCertification[];
}): string | null {
  const { links, certifications } = args;

  // The parsed host below is lower-cased, so the domain must be too;
  // otherwise a caller passing "Acme.com" would never match anything.
  const vendorDomain = args.vendorDomain.trim().toLowerCase();

  // With an empty domain the suffix test degenerates to `endsWith('.')`,
  // which accepts any trailing-dot host (e.g. "evil.com."). Refuse outright.
  if (vendorDomain === '') return null;

  const isOnVendorDomain = (url: string): boolean => {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      return false;
    }
    // Only web URLs are scrapeable; `ftp://acme.com` or `javascript://acme.com`
    // would otherwise pass the hostname check.
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      return false;
    }
    const host = parsed.hostname.toLowerCase();
    return host === vendorDomain || host.endsWith(`.${vendorDomain}`);
  };

  const byLabel = (label: string): string | null =>
    links.find((l) => l.label === label && isOnVendorDomain(l.url))?.url ??
    null;

  const trustUrl = byLabel('Trust & Security');
  if (trustUrl) return trustUrl;

  const securityUrl = byLabel('Security Overview');
  if (securityUrl) return securityUrl;

  for (const cert of certifications) {
    if (cert.status !== 'verified') continue;
    if (cert.url && isOnVendorDomain(cert.url)) return cert.url;
  }

  return null;
}
'./trust-portal-deep-scrape-merge'; +import { pickDeepScrapeSourceUrl } from './deep-scrape-source-url'; +import { firecrawlAgentJsonSchema } from './firecrawl-agent-schema-json'; +import { buildFirecrawlAgentPrompt } from './firecrawl-agent-prompt'; +import { + asRecord, + countPopulatedAgentFields, + extractAgentPayloadCandidates, +} from './firecrawl-agent-payload'; export async function firecrawlResearchCore(params: { vendorName: string; @@ -20,125 +33,32 @@ export async function firecrawlResearchCore(params: { const { firecrawlClient, vendorDomain, seedUrls } = setup; const { vendorName, vendorWebsite } = params; - const prompt = `Complete cyber security research on the vendor "${vendorName}" with website ${vendorWebsite}. - -Extract the following information: - -1. **Certifications**: Find all security and compliance certifications. For each one found, determine: - - The type of certification (SOC 2 Type I, SOC 2 Type II, ISO 27001, ISO 27017, ISO 27018, ISO 27701, ISO 42001, FedRAMP, HIPAA, PCI DSS, GDPR, TISAX, CSA STAR, C5, SOC 1, SOC 3, etc.) - - Whether it's currently active/verified, expired, or not certified - - Any issue or expiry dates mentioned - - Direct link to the certification report or trust page - -2. **Security & Legal Links**: Find the direct URLs to these pages. IMPORTANT: Many vendors host their trust portal on a third-party platform (e.g., SafeBase at trust.page, Vanta, Drata, Whistic). Prefer the actual trust portal where customers can request security reports over documentation pages that just describe compliance processes. - - **Trust Center / Security Portal**: The page where customers can review security posture and request compliance reports. This is NOT the docs page about security — it's the dedicated trust portal. Look for links labeled "Trust Center", "Security", "Trust Portal" in the site navigation or footer. 
It may be hosted on a subdomain (trust.${vendorDomain}, security.${vendorDomain}) or a third-party domain (e.g., ${vendorName.toLowerCase()}.trust.page, ${vendorName.toLowerCase()}.safebase.io). TIP: Try searching "${vendorName} trust portal" or "${vendorName} security trust center" to find it if not immediately visible on the site. - - **Privacy Policy**: Usually at /privacy or /privacy-policy - - **Terms of Service**: Usually at /terms or /tos - - **Security Overview**: A page describing security practices (this CAN be a docs page) - - **SOC 2 Report**: Direct link to request or download the SOC 2 report - -3. **Summary**: Provide an overall assessment of the vendor's security posture based on your findings. - -Focus on the official website ${vendorWebsite} and its trust/security/compliance pages.`; + const prompt = buildFirecrawlAgentPrompt({ + vendorName, + vendorWebsite, + vendorDomain, + }); - let agentResponse; - try { - agentResponse = await firecrawlClient.agent({ + const runCoreAgent = async (urls: string[]) => + firecrawlClient.agent({ prompt, - urls: seedUrls, + urls, strictConstrainToURLs: false, - maxCredits: 2500, - timeout: 360, + maxCredits: 4000, + // SDK polls this long before returning whatever status it has. 360s + // wasn't enough for slow SPA trust centers (Ubiquiti) — SDK returned + // "processing" and we silently parsed empty data. 25 min gives the + // agent plenty of room; the new status check also ensures we surface + // timeouts instead of pretending success. 
+ timeout: 1500, pollInterval: 5, ...({ model: 'spark-1-pro' } as Record), // SDK types lag behind API — model is supported but not typed yet - schema: { - type: 'object', - properties: { - risk_level: { - type: 'string', - description: - 'Overall vendor risk level: critical, high, medium, low, or very_low', - }, - security_assessment: { - type: 'string', - description: - 'A detailed paragraph summarizing the vendor security posture, including strengths, weaknesses, and key findings', - }, - last_researched_at: { - type: 'string', - description: 'ISO 8601 date of when this research was conducted', - }, - certifications: { - type: 'array', - description: - 'All security and compliance certifications found on the vendor website', - items: { - type: 'object', - properties: { - type: { - type: 'string', - description: - 'Certification name, e.g. SOC 2 Type II, ISO 27001, FedRAMP, HIPAA, PCI DSS, GDPR, ISO 42001, ISO 27017, ISO 27018, TISAX, CSA STAR, C5, etc.', - }, - status: { - type: 'string', - enum: ['verified', 'expired', 'not_certified', 'unknown'], - description: - 'Whether the certification is currently active/verified, expired, not certified, or unknown', - }, - issued_at: { - type: 'string', - description: - 'ISO 8601 date when the certification was issued, if mentioned', - }, - expires_at: { - type: 'string', - description: - 'ISO 8601 date when the certification expires, if mentioned', - }, - url: { - type: 'string', - description: - 'Direct URL to the certification report or trust page on the vendor domain', - }, - }, - required: ['type'], - }, - }, - links: { - type: 'object', - description: - 'Direct URLs to key legal and security pages on the vendor domain', - properties: { - privacy_policy_url: { - type: 'string', - description: 'Direct URL to the privacy policy page', - }, - terms_of_service_url: { - type: 'string', - description: 'Direct URL to the terms of service page', - }, - trust_center_url: { - type: 'string', - description: - 'Direct URL to 
the trust portal where customers can review security posture and request reports. Prefer the dedicated trust portal (often on trust.page, safebase.io, vanta.com, or a trust. subdomain) over documentation pages.', - }, - security_page_url: { - type: 'string', - description: - 'Direct URL to the security overview or security practices page', - }, - soc2_report_url: { - type: 'string', - description: - 'Direct URL to request or download the SOC 2 report', - }, - }, - }, - }, - required: ['security_assessment'], - }, + schema: firecrawlAgentJsonSchema, }); + + let agentResponse; + try { + agentResponse = await runCoreAgent(seedUrls); } catch (error) { return handleFirecrawlError(error, { vendorName, @@ -147,23 +67,98 @@ Focus on the official website ${vendorWebsite} and its trust/security/compliance }); } - if (!agentResponse.success || agentResponse.status === 'failed') { + const responseErrorMessage = + typeof agentResponse.error === 'string' + ? agentResponse.error + : String(agentResponse.error ?? 
''); + const shouldRetryFetchFailed = + agentResponse.status === 'failed' && + /fetch failed/i.test(responseErrorMessage); + + if (shouldRetryFetchFailed) { + const retryUrls = Array.from( + new Set([ + ...seedUrls, + `https://${vendorDomain}`, + `https://${vendorDomain}/trust-center`, + `https://${vendorDomain}/trust-center#cloud-security`, + `https://www.${vendorDomain}`, + `https://www.${vendorDomain}/trust-center`, + `https://www.${vendorDomain}/trust-center#cloud-security`, + ]), + ); + + logger.warn('Firecrawl core research fetch failed; retrying once', { + vendorWebsite, + originalStatus: agentResponse.status, + originalError: responseErrorMessage, + retryUrlCount: retryUrls.length, + }); + + try { + agentResponse = await runCoreAgent(retryUrls); + } catch (error) { + return handleFirecrawlError(error, { + vendorName, + vendorWebsite, + callType: 'core_retry', + }); + } + } + + if (!agentResponse.success || agentResponse.status !== 'completed') { + const isProcessing = agentResponse.status === 'processing'; logger.warn('Firecrawl core research job did not complete successfully', { vendorWebsite, status: agentResponse.status, + success: agentResponse.success, error: agentResponse.error, + // Full raw response only on the exceptional path — on happy path + // the parsed data is already surfaced by the snapshot log below. + agentResponseJson: JSON.stringify(agentResponse).slice(0, 4000), + note: isProcessing + ? 'SDK returned while the agent job is still running on Firecrawl. Bump timeout, or poll with getAgentStatus.' + : undefined, }); return null; } - const parsed = vendorRiskAssessmentAgentSchema.safeParse(agentResponse.data); - if (!parsed.success) { + const payloadCandidates = extractAgentPayloadCandidates(agentResponse); + const parseAttempts = payloadCandidates.map((candidate) => ({ + candidate, + result: vendorRiskAssessmentAgentSchema.safeParse(candidate), + })); + // Pick the candidate that parsed successfully AND populated the most + // fields. 
Every schema field is optional, so the outer wrapper parses + // as {} and would otherwise win over the nested `.data` payload — which + // is exactly what was dropping real agent output on the floor. + const successfulAttempts = parseAttempts.filter((a) => a.result.success); + const parsedAttempt = successfulAttempts.reduce< + (typeof successfulAttempts)[number] | null + >((best, curr) => { + if (!curr.result.success) return best; + if (!best || !best.result.success) return curr; + return countPopulatedAgentFields(curr.result.data) > + countPopulatedAgentFields(best.result.data) + ? curr + : best; + }, null); + + if (!parsedAttempt || !parsedAttempt.result.success) { + const responseRecord = asRecord(agentResponse); + const firstAttempt = parseAttempts[0]?.result; + const primaryIssues = + firstAttempt && !firstAttempt.success ? firstAttempt.error.issues : []; + logger.warn('Firecrawl core research returned invalid data shape', { vendorWebsite, - issues: parsed.error.issues, + issues: primaryIssues, + payloadCandidateCount: payloadCandidates.length, + responseKeys: responseRecord ? Object.keys(responseRecord) : [], }); return null; } + const parsed = parsedAttempt.result; const links = parsed.data.links ?? null; const linkPairs: Array<{ label: string; url: string }> = []; @@ -200,11 +195,88 @@ Focus on the official website ${vendorWebsite} and its trust/security/compliance url: validateVendorUrl(c.url ?? null, vendorDomain, `cert:${c.type}`), })) ?? []; + logger.info('Firecrawl Agent returned — pre-deep-scrape snapshot', { + vendorWebsite, + normalizedLinksJson: JSON.stringify(normalizedLinks), + agentCertificationsJson: JSON.stringify( + certifications.map((c) => ({ + type: c.type, + status: c.status, + })), + ), + verifiedAgentCertCount: certifications.filter( + (c) => c.status === 'verified', + ).length, + agentRiskLevel: parsed.data.risk_level ?? 
null, + }); + + const deepScrapeSourceUrl = pickDeepScrapeSourceUrl({ + vendorDomain, + links: normalizedLinks, + certifications, + }); + + let mergedCertifications: VendorRiskAssessmentCertification[] = + certifications; + if (deepScrapeSourceUrl) { + logger.info('Trust portal deep-scrape: source URL resolved', { + vendorWebsite, + vendorDomain, + sourceUrl: deepScrapeSourceUrl, + }); + const deepCerts = await deepScrapeTrustPortal({ + vendorName, + vendorDomain, + sourceUrl: deepScrapeSourceUrl, + firecrawlClient, + }); + if (deepCerts && deepCerts.length > 0) { + mergedCertifications = mergeCertifications(certifications, deepCerts); + logger.info('Trust portal deep-scrape merged into core certifications', { + vendorWebsite, + coreCount: certifications.length, + deepCount: deepCerts.length, + mergedCount: mergedCertifications.length, + mergedTypesJson: JSON.stringify( + mergedCertifications.map((c) => ({ + type: c.type, + status: c.status, + })), + ), + }); + } else { + logger.info( + 'Trust portal deep-scrape returned no certifications — keeping Agent result', + { + vendorWebsite, + deepReturnedNull: deepCerts === null, + deepReturnedEmpty: Array.isArray(deepCerts) && deepCerts.length === 0, + }, + ); + } + } else { + logger.info( + 'Trust portal deep-scrape skipped: pickDeepScrapeSourceUrl found no usable URL on vendor domain', + { + vendorWebsite, + vendorDomain, + availableLinksJson: JSON.stringify( + normalizedLinks.map((l) => ({ label: l.label, url: l.url })), + ), + verifiedCertsWithUrlsJson: JSON.stringify( + certifications + .filter((c) => c.status === 'verified' && c.url) + .map((c) => ({ type: c.type, url: c.url })), + ), + }, + ); + } + logger.info('Firecrawl core research completed', { vendorWebsite, found: { links: normalizedLinks.length, - certifications: certifications.length, + certifications: mergedCertifications.length, }, }); @@ -217,7 +289,8 @@ Focus on the official website ${vendorWebsite} and its trust/security/compliance new 
Date().toISOString(), riskLevel: parsed.data.risk_level ?? null, securityAssessment: parsed.data.security_assessment ?? null, - certifications: certifications.length > 0 ? certifications : null, + certifications: + mergedCertifications.length > 0 ? mergedCertifications : null, links: normalizedLinks.length > 0 ? normalizedLinks : null, }; } diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-news.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-news.ts index 56b7154dcc..3fef48adb6 100644 --- a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-news.ts +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-news.ts @@ -81,7 +81,11 @@ Search the company's blog, newsroom, press releases, and reputable tech news sou urls: [origin, `${origin}/blog`, `${origin}/newsroom`, `${origin}/press`], strictConstrainToURLs: false, maxCredits: 2500, - timeout: 360, + // SDK polls this long before returning whatever status it has. + // Matches core agent timeout (25 min) — news agent was hitting 360s + // for slow vendor sites and silently returning processing state as + // "no news items." + timeout: 1500, pollInterval: 5, ...({ model: 'spark-1-pro' } as Record), schema: newsResponseSchema, @@ -94,11 +98,18 @@ Search the company's blog, newsroom, press releases, and reputable tech news sou }); } - if (!agentResponse.success || agentResponse.status === 'failed') { + if (!agentResponse.success || agentResponse.status !== 'completed') { + const isProcessing = agentResponse.status === 'processing'; logger.warn('Firecrawl news research job did not complete successfully', { vendorWebsite, status: agentResponse.status, + success: agentResponse.success, error: agentResponse.error, + // Full raw response only on the exceptional path. + agentResponseJson: JSON.stringify(agentResponse).slice(0, 4000), + note: isProcessing + ? 'SDK returned while the news agent job is still running on Firecrawl. 
/**
 * Helpers for extracting the actual structured payload out of a
 * Firecrawl Agent response. The SDK wraps data under `.data`, but across
 * versions it has shown up under `.output`, `.result`, or `.response` too.
 *
 * Because every field in `vendorRiskAssessmentAgentSchema` is optional,
 * parsing the outer wrapper object against the schema succeeds as an
 * empty `{}` — which would silently beat the populated inner `.data`
 * payload under a `.find(ok)` lookup. Callers must score candidates by
 * populated-field count and pick the best, not the first.
 */

/** Wrapper keys under which the payload has been observed, in visit order. */
const AGENT_PAYLOAD_KEYS = ['data', 'output', 'result', 'response'] as const;

/**
 * View `value` as a string-keyed record.
 *
 * @returns `value` itself (typed as a record) when it is a non-null object —
 *          arrays included, since `typeof [] === 'object'` — otherwise `null`.
 */
export function asRecord(value: unknown): Record<string, unknown> | null {
  return value && typeof value === 'object'
    ? (value as Record<string, unknown>)
    : null;
}

/**
 * Collect every value that might be the agent's structured payload.
 *
 * Walks depth-first from `agentResponse`, following only the known wrapper
 * keys. The response itself is always the first candidate (unless it is
 * `undefined`); nested values follow in visit order.
 *
 * @param agentResponse Raw value returned by the Firecrawl SDK.
 * @returns Candidates in visit order, de-duplicated by identity. `null` and
 *          primitives are kept as candidates; only `undefined` is dropped.
 */
export function extractAgentPayloadCandidates(
  agentResponse: unknown,
): unknown[] {
  const candidates: unknown[] = [];
  // Identity set: guards against cycles and against the same object being
  // reachable under two wrapper keys.
  const seen = new Set<unknown>();

  const visit = (value: unknown): void => {
    if (value === undefined || seen.has(value)) return;
    seen.add(value);
    candidates.push(value);

    const record = asRecord(value);
    if (!record) return;

    for (const key of AGENT_PAYLOAD_KEYS) {
      visit(record[key]);
    }
  };

  visit(agentResponse);
  return candidates;
}

/**
 * Count the top-level fields of `parsed` that carry a value.
 *
 * A field is skipped when it is `null`, `undefined`, an empty array, or a
 * plain object with no own keys. Everything else counts — including `0`,
 * `false` and the empty string.
 *
 * @returns The number of populated fields, or `0` if `parsed` is not an object.
 */
export function countPopulatedAgentFields(parsed: unknown): number {
  if (!parsed || typeof parsed !== 'object') return 0;
  let count = 0;
  for (const value of Object.values(parsed as Record<string, unknown>)) {
    if (value === null || value === undefined) continue;
    if (Array.isArray(value) && value.length === 0) continue;
    if (
      typeof value === 'object' &&
      !Array.isArray(value) &&
      Object.keys(value as Record<string, unknown>).length === 0
    )
      continue;
    count += 1;
  }
  return count;
}
/**
 * Builds the Firecrawl Agent prompt for core vendor risk research.
 *
 * Design intent: URL discovery is the primary goal, not certification
 * extraction. The Agent often encounters JavaScript-only trust portals
 * whose markdown is empty until a browser executes it — if that happens,
 * the Agent should still return the URL so the downstream
 * `deepScrapeTrustPortal` step can handle SPA rendering.
 *
 * @param params.vendorName    Display name; quoted verbatim in the prompt and
 *                             reduced to a lower-case ASCII-alphanumeric slug
 *                             for guessing third-party portal hosts.
 * @param params.vendorWebsite Vendor website URL, used as the starting point.
 * @param params.vendorDomain  Vendor host name, used for path/subdomain hints.
 * @returns The complete prompt text.
 */
export function buildFirecrawlAgentPrompt(params: {
  vendorName: string;
  vendorWebsite: string;
  vendorDomain: string;
}): string {
  const { vendorName, vendorWebsite, vendorDomain } = params;
  const vendorSlug = vendorName.toLowerCase().replace(/[^a-z0-9]+/g, '');

  // A name with no ASCII letters/digits yields an empty slug; emitting the
  // hint anyway would tell the agent to visit ".trust.page", ".vanta.com", …
  // so the bullet is dropped entirely in that case.
  const thirdPartyPortalHint = vendorSlug
    ? `\n   - Also check third-party portals: ${vendorSlug}.trust.page (SafeBase), ${vendorSlug}.safebase.io, ${vendorSlug}.vanta.com, ${vendorSlug}.drata.com`
    : '';

  return `You are researching the security posture of "${vendorName}" (${vendorWebsite}).

# Primary goal
Return a trust_center_url whenever the vendor has ANY trust, security, or compliance page — even if you cannot extract certification details from it. A downstream system will deep-scrape the URL you return. Your job is to FIND the URL reliably; extracting certifications yourself is a bonus, not a requirement.

# Search method

1. Start at ${vendorWebsite}. Scan the top-nav, footer, and any "Security", "Trust", "Legal", "Compliance", "Resources", or "About" menus.

2. If nothing is surfaced in the nav, DIRECTLY visit these common paths on ${vendorDomain} and confirm they exist:
   - /trust-center /trust /security /compliance
   - /security-and-compliance /trust/overview /about/security
   - Also check subdomains: trust.${vendorDomain}, security.${vendorDomain}${thirdPartyPortalHint}

3. Some vendor trust centers are JavaScript SPAs that render empty HTML without browser execution. If a trust page loads but the markdown looks thin or only contains navigation chrome (no security content at all), that's a SPA — STILL return its URL as trust_center_url. Do not discard it because you can't see the content.

4. Many trust pages hide certifications behind tabs or sidebar sections (e.g. /trust-center#cloud-security on Ubiquiti, /trust-center/compliance). Visit as many sub-sections as you can; return any certifications you can extract from them.

# Extraction rules for certifications

Only return a certification when the page explicitly names a framework as current: SOC 2 Type I/II, ISO 27001/27017/27018/27701, ISO 42001, ISO 9001, FedRAMP, HIPAA, PCI DSS, GDPR, TISAX, CSA STAR, C5, NEN 7510. For each:
- status: "verified" when the page lists the framework as current (includes badge images, "we are certified", "compliant with X"). "expired" only if the page explicitly says so. "not_certified" only if the page explicitly says the vendor is NOT certified. "unknown" otherwise.
- Never invent a cert that isn't on the page. Never default to "not_certified".
- Include issued_at / expires_at dates only when printed on the page.

# Output contract (strict)

- links.trust_center_url — REQUIRED whenever any of these exist on the vendor's domain or a recognised third-party portal: a /trust*, /security*, /compliance* page; a trust. or security. subdomain; or a third-party trust portal. Return the best landing URL. Leave empty ONLY when you have confirmed no such page exists anywhere.
- links.privacy_policy_url, links.terms_of_service_url, links.security_page_url, links.soc2_report_url — return only when confirmed; otherwise empty.
- certifications — may be an empty array. Do NOT pad it.
- security_assessment — one paragraph summarising what you observed. If the trust portal was SPA-only and you could not read content, say so explicitly ("Trust portal at <trust_center_url> appears to be a JavaScript SPA; deep-scrape will extract content").
- risk_level — your best estimate among critical/high/medium/low/very_low based on what you found.

Focus on ${vendorWebsite} and its trust/security/compliance paths. Only cite URLs on ${vendorDomain}, its subdomains, or a recognised third-party portal hosting this vendor's trust page.`;
}
SOC 2 Type II, ISO 27001, FedRAMP, HIPAA, PCI DSS, GDPR, ISO 42001, ISO 27017, ISO 27018, TISAX, CSA STAR, C5, etc.', + }, + status: { + type: 'string', + enum: ['verified', 'expired', 'not_certified', 'unknown'], + description: + 'Whether the certification is currently active/verified, expired, not certified, or unknown', + }, + issued_at: { + type: 'string', + description: + 'ISO 8601 date when the certification was issued, if mentioned', + }, + expires_at: { + type: 'string', + description: + 'ISO 8601 date when the certification expires, if mentioned', + }, + url: { + type: 'string', + description: + 'Direct URL to the certification report or trust page on the vendor domain', + }, + }, + required: ['type'], + }, + }, + links: { + type: 'object', + description: + 'Direct URLs to key legal and security pages on the vendor domain', + properties: { + privacy_policy_url: { + type: 'string', + description: 'Direct URL to the privacy policy page', + }, + terms_of_service_url: { + type: 'string', + description: 'Direct URL to the terms of service page', + }, + trust_center_url: { + type: 'string', + description: + 'Direct URL to the trust portal where customers can review security posture and request reports. Prefer the dedicated trust portal (often on trust.page, safebase.io, vanta.com, or a trust. 
subdomain) over documentation pages.', + }, + security_page_url: { + type: 'string', + description: + 'Direct URL to the security overview or security practices page', + }, + soc2_report_url: { + type: 'string', + description: 'Direct URL to request or download the SOC 2 report', + }, + }, + }, + }, + required: ['security_assessment'], +} as const; diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.spec.ts new file mode 100644 index 0000000000..377dbda0df --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.spec.ts @@ -0,0 +1,70 @@ +import { setupFirecrawlClient } from './firecrawl-agent-shared'; + +jest.mock('@trigger.dev/sdk', () => ({ + logger: { warn: jest.fn(), info: jest.fn(), debug: jest.fn() }, +})); + +jest.mock('@mendable/firecrawl-js', () => + jest.fn().mockImplementation(() => ({})), +); + +describe('setupFirecrawlClient', () => { + const originalApiKey = process.env.FIRECRAWL_API_KEY; + + beforeEach(() => { + process.env.FIRECRAWL_API_KEY = 'test-key'; + }); + + afterEach(() => { + if (originalApiKey === undefined) { + delete process.env.FIRECRAWL_API_KEY; + } else { + process.env.FIRECRAWL_API_KEY = originalApiKey; + } + }); + + it('includes trust-center and compliance seed URLs for stronger portal discovery', () => { + const setup = setupFirecrawlClient({ + vendorName: 'Ubiquiti', + vendorWebsite: 'https://www.ui.com', + }); + + expect(setup).not.toBeNull(); + expect(setup?.seedUrls).toEqual( + expect.arrayContaining([ + 'https://www.ui.com', + 'https://www.ui.com/trust', + 'https://www.ui.com/trust-center', + 'https://www.ui.com/trust-center#cloud-security', + 'https://www.ui.com/trust-center#corporate-security', + 'https://www.ui.com/trust-center#ndaa-compliance', + 'https://www.ui.com/security', + 'https://www.ui.com/security/trust-center', + 'https://www.ui.com/security/compliance', + 
'https://www.ui.com/security-and-compliance', + 'https://www.ui.com/compliance', + ]), + ); + + // Keep seeds deduplicated to avoid wasting crawl credits. + expect(new Set(setup?.seedUrls).size).toBe(setup?.seedUrls.length); + }); + + it('adds www fallback seeds when vendor website is an apex domain', () => { + const setup = setupFirecrawlClient({ + vendorName: 'Ubiquiti', + vendorWebsite: 'https://ui.com', + }); + + expect(setup).not.toBeNull(); + expect(setup?.seedUrls).toEqual( + expect.arrayContaining([ + 'https://ui.com', + 'https://ui.com/trust-center#cloud-security', + 'https://www.ui.com', + 'https://www.ui.com/trust-center', + 'https://www.ui.com/trust-center#cloud-security', + ]), + ); + }); +}); diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.ts index 347df01eaa..c3b9727aa4 100644 --- a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.ts +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent-shared.ts @@ -69,18 +69,47 @@ export function setupFirecrawlClient(params: { const firecrawlClient = new Firecrawl({ apiKey }); - const seedUrls = [ - origin, - `${origin}/privacy`, - `${origin}/privacy-policy`, - `${origin}/terms`, - `${origin}/terms-of-service`, - `${origin}/security`, - `${origin}/trust`, - `${origin}/legal`, - `${origin}/compliance`, + const origins = new Set([origin]); + try { + const originUrl = new URL(origin); + const host = originUrl.hostname.toLowerCase(); + // Firecrawl can occasionally fail on apex hosts even when the canonical + // site is served from www.. Include a safe fallback origin. + if (host === vendorDomain) { + origins.add(`${originUrl.protocol}//www.${vendorDomain}`); + } + } catch { + // Keep existing origin-only behavior if URL parsing unexpectedly fails. 
+ } + + const seedUrlsFromOrigin = (baseOrigin: string): string[] => [ + baseOrigin, + `${baseOrigin}/trust`, + `${baseOrigin}/trust-center`, + `${baseOrigin}/trust-center#cloud-security`, + `${baseOrigin}/trust-center#corporate-security`, + `${baseOrigin}/trust-center#ndaa-compliance`, + `${baseOrigin}/security`, + `${baseOrigin}/security/trust-center`, + `${baseOrigin}/security/compliance`, + `${baseOrigin}/security-and-compliance`, + `${baseOrigin}/compliance`, + `${baseOrigin}/compliance/security`, + `${baseOrigin}/privacy`, + `${baseOrigin}/privacy-policy`, + `${baseOrigin}/terms`, + `${baseOrigin}/terms-of-service`, + `${baseOrigin}/legal`, ]; + const seedUrls = Array.from( + new Set([ + ...Array.from(origins).flatMap((baseOrigin) => + seedUrlsFromOrigin(baseOrigin), + ), + ]), + ); + return { firecrawlClient, origin, vendorDomain, seedUrls }; } diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.spec.ts new file mode 100644 index 0000000000..c40dc27c46 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.spec.ts @@ -0,0 +1,145 @@ +import type { VendorRiskAssessmentCertification } from './agent-types'; +import { mergeCertifications } from './trust-portal-deep-scrape-merge'; + +const cert = ( + overrides: Partial = {}, +): VendorRiskAssessmentCertification => ({ + type: 'SOC 2 Type II', + status: 'verified', + issuedAt: null, + expiresAt: null, + url: null, + ...overrides, +}); + +describe('mergeCertifications', () => { + it('returns core untouched when deep is empty', () => { + const core = [cert({ type: 'SOC 2 Type II' })]; + expect(mergeCertifications(core, [])).toEqual(core); + }); + + it('returns deep when core is empty', () => { + const deep = [cert({ type: 'ISO 27001' })]; + expect(mergeCertifications([], deep)).toEqual(deep); + }); + + it('dedupes by canonical slug (SOC 2 
variants collapse)', () => { + const core = [cert({ type: 'SOC 2 Type II', status: 'verified' })]; + const deep = [cert({ type: 'SOC2', status: 'unknown' })]; + + const result = mergeCertifications(core, deep); + + expect(result).toHaveLength(1); + expect(result[0].status).toBe('verified'); + }); + + it('verified wins over unknown regardless of source side', () => { + const core = [cert({ type: 'ISO 27001', status: 'unknown' })]; + const deep = [cert({ type: 'ISO 27001', status: 'verified' })]; + + const result = mergeCertifications(core, deep); + + expect(result[0].status).toBe('verified'); + }); + + it('status priority: verified > expired > unknown > not_certified', () => { + const cases: Array<{ + a: VendorRiskAssessmentCertification['status']; + b: VendorRiskAssessmentCertification['status']; + expected: VendorRiskAssessmentCertification['status']; + }> = [ + { a: 'expired', b: 'unknown', expected: 'expired' }, + { a: 'unknown', b: 'not_certified', expected: 'unknown' }, + { a: 'verified', b: 'expired', expected: 'verified' }, + { a: 'not_certified', b: 'verified', expected: 'verified' }, + ]; + + for (const { a, b, expected } of cases) { + const result = mergeCertifications( + [cert({ type: 'PCI DSS', status: a })], + [cert({ type: 'PCI DSS', status: b })], + ); + expect(result).toHaveLength(1); + expect(result[0].status).toBe(expected); + } + }); + + it('preserves url/dates from whichever side provides them', () => { + const core = [ + cert({ + type: 'ISO 27001', + status: 'unknown', + url: null, + issuedAt: null, + }), + ]; + const deep = [ + cert({ + type: 'ISO 27001', + status: 'verified', + url: 'https://acme.com/iso.pdf', + issuedAt: '2025-03-01T00:00:00.000Z', + }), + ]; + + const result = mergeCertifications(core, deep); + + expect(result[0]).toMatchObject({ + type: 'ISO 27001', + status: 'verified', + url: 'https://acme.com/iso.pdf', + issuedAt: '2025-03-01T00:00:00.000Z', + }); + }); + + it('prefers core url/dates when both sides have them', () => { 
+ const core = [ + cert({ + type: 'SOC 2 Type II', + status: 'verified', + url: 'https://core.example.com/soc2', + issuedAt: '2025-01-01T00:00:00.000Z', + }), + ]; + const deep = [ + cert({ + type: 'SOC 2 Type II', + status: 'verified', + url: 'https://deep.example.com/soc2', + issuedAt: '2024-01-01T00:00:00.000Z', + }), + ]; + + const result = mergeCertifications(core, deep); + + expect(result[0].url).toBe('https://core.example.com/soc2'); + expect(result[0].issuedAt).toBe('2025-01-01T00:00:00.000Z'); + }); + + it('keeps distinct certifications when slugs differ', () => { + const core = [cert({ type: 'SOC 2 Type II' })]; + const deep = [ + cert({ type: 'ISO 27001' }), + cert({ type: 'PCI DSS' }), + ]; + + const result = mergeCertifications(core, deep); + + expect(result).toHaveLength(3); + expect(result.map((c) => c.type).sort()).toEqual([ + 'ISO 27001', + 'PCI DSS', + 'SOC 2 Type II', + ]); + }); + + it('falls back to lowercased type when the slug mapper returns null', () => { + const core = [cert({ type: 'FooBar Framework', status: 'unknown' })]; + const deep = [cert({ type: 'foobar framework', status: 'verified' })]; + + const result = mergeCertifications(core, deep); + + expect(result).toHaveLength(1); + expect(result[0].status).toBe('verified'); + }); +}); diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.ts new file mode 100644 index 0000000000..93ec13877f --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-merge.ts @@ -0,0 +1,86 @@ +import type { + VendorRiskAssessmentCertification, + VendorRiskAssessmentCertificationStatus, +} from './agent-types'; + +// Inline slug mapper — mirrors `mapCertificationToBadgeType` in +// vendor-risk-assessment-task.ts but lives alongside the merge logic +// so this file has no upward dependency on the orchestrating task. 
+// Keep in sync if new frameworks are added there. +function canonicalSlug(type: string): string { + const normalized = type.toLowerCase().replace(/[^a-z0-9]/g, ''); + if (normalized.includes('soc2') || normalized.includes('soc 2')) return 'soc2'; + if (normalized.includes('iso27001') || normalized.includes('27001')) + return 'iso27001'; + if (normalized.includes('iso42001') || normalized.includes('42001')) + return 'iso42001'; + if (normalized.includes('iso9001') || normalized.includes('9001')) + return 'iso9001'; + if (normalized.includes('gdpr')) return 'gdpr'; + if (normalized.includes('hipaa')) return 'hipaa'; + if ( + normalized.includes('pcidss') || + normalized.includes('pci') || + normalized.includes('paymentcard') + ) + return 'pci_dss'; + if (normalized.includes('nen7510') || normalized.includes('7510')) + return 'nen7510'; + // Fallback: lowercased trimmed type string + return type.trim().toLowerCase(); +} + +const STATUS_PRIORITY: Record = + { + verified: 3, + expired: 2, + unknown: 1, + not_certified: 0, + }; + +function pickHigherStatus( + a: VendorRiskAssessmentCertificationStatus, + b: VendorRiskAssessmentCertificationStatus, +): VendorRiskAssessmentCertificationStatus { + return STATUS_PRIORITY[a] >= STATUS_PRIORITY[b] ? a : b; +} + +/** + * Merge certifications from the core Firecrawl Agent and the trust-portal + * deep-scrape, deduping by canonical slug. Status resolves via priority + * (verified > expired > unknown > not_certified). URL/dates prefer the + * core value when present; otherwise the deep value. + */ +export function mergeCertifications( + core: VendorRiskAssessmentCertification[], + deep: VendorRiskAssessmentCertification[], +): VendorRiskAssessmentCertification[] { + if (core.length === 0) return deep; + if (deep.length === 0) return core; + + const bySlug = new Map(); + + // Seed with core so its URL/date values win on ties. 
+ for (const c of core) { + bySlug.set(canonicalSlug(c.type), { ...c }); + } + + for (const d of deep) { + const slug = canonicalSlug(d.type); + const existing = bySlug.get(slug); + if (!existing) { + bySlug.set(slug, { ...d }); + continue; + } + + bySlug.set(slug, { + type: existing.type, // keep core's display type + status: pickHigherStatus(existing.status, d.status), + issuedAt: existing.issuedAt ?? d.issuedAt ?? null, + expiresAt: existing.expiresAt ?? d.expiresAt ?? null, + url: existing.url ?? d.url ?? null, + }); + } + + return Array.from(bySlug.values()); +} diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-scrape-options.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-scrape-options.ts new file mode 100644 index 0000000000..33fcd62cc0 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-scrape-options.ts @@ -0,0 +1,107 @@ +import type { DeepScrapeSection } from './trust-portal-deep-scrape-sections'; + +/** + * Builders for the two kinds of Firecrawl `scrape` requests the trust-portal + * deep-scrape issues — the initial full-page pull, and the per-section pull + * that may need to click a sidebar item (by href, CSS selector, or text) to + * reveal the content. + */ + +const INITIAL_WAIT_MS = 3000; +const CLICK_WAIT_BEFORE_MS = 1500; +const CLICK_WAIT_AFTER_MS = 2000; +const PATH_WAIT_MS = 2000; +// Firecrawl scrape v2 `timeout` is capped at 300000ms. +const SCRAPE_TIMEOUT_MS = 120_000; + +/** Escape `"` and `\` for use inside a CSS double-quoted attribute value. */ +function cssEscapeAttr(value: string): string { + return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); +} + +/** + * JS payload that finds the smallest visible DOM element whose exact + * textContent matches `tabLabel` and clicks it. Used when a trust portal + * sidebar is composed of buttons/divs without href attributes. 
+ */ +function buildClickByTextScript(tabLabel: string): string { + const safe = JSON.stringify(tabLabel); + return `(() => { + const label = ${safe}; + const candidates = Array.from( + document.querySelectorAll( + 'button, a, [role="tab"], [role="button"], [role="menuitem"], li, span, div' + ) + ) + .filter((el) => { + if (!el || typeof el.textContent !== 'string') return false; + if (el.textContent.trim() !== label) return false; + if (el.children && el.children.length > 2) return false; + if (typeof el.getBoundingClientRect === 'function') { + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + } + return true; + }) + .sort((a, b) => (a.textContent || '').length - (b.textContent || '').length); + const target = candidates[0]; + if (target) { + try { target.scrollIntoView({ block: 'center' }); } catch {} + target.click(); + } +})();`; +} + +export function buildInitialScrapeOptions() { + return { + formats: ['markdown', 'links'] as const, + onlyMainContent: false, + timeout: SCRAPE_TIMEOUT_MS, + actions: [{ type: 'wait', milliseconds: INITIAL_WAIT_MS }], + }; +} + +export function buildSectionScrapeOptions(section: DeepScrapeSection) { + if (section.tabLabel) { + return { + formats: ['markdown'] as const, + onlyMainContent: true, + timeout: SCRAPE_TIMEOUT_MS, + actions: [ + { type: 'wait', milliseconds: CLICK_WAIT_BEFORE_MS }, + { + type: 'executeJavascript', + script: buildClickByTextScript(section.tabLabel), + }, + { type: 'wait', milliseconds: CLICK_WAIT_AFTER_MS }, + ], + }; + } + + if (section.anchor) { + const safeAnchor = cssEscapeAttr(section.anchor); + const safeLabel = cssEscapeAttr(section.label); + const selector = [ + `a[href="${safeAnchor}"]`, + `a[href$="${safeAnchor}"]`, + `[data-tab="${safeLabel}"]`, + ].join(', '); + return { + formats: ['markdown'] as const, + onlyMainContent: true, + timeout: SCRAPE_TIMEOUT_MS, + actions: [ + { type: 'wait', milliseconds: CLICK_WAIT_BEFORE_MS }, + { type: 
'click', selector }, + { type: 'wait', milliseconds: CLICK_WAIT_AFTER_MS }, + ], + }; + } + + return { + formats: ['markdown'] as const, + onlyMainContent: true, + timeout: SCRAPE_TIMEOUT_MS, + actions: [{ type: 'wait', milliseconds: PATH_WAIT_MS }], + }; +} diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.spec.ts new file mode 100644 index 0000000000..8a575f3b3b --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.spec.ts @@ -0,0 +1,145 @@ +import { discoverSectionUrls } from './trust-portal-deep-scrape-sections'; + +describe('discoverSectionUrls', () => { + const sourceUrl = 'https://ui.com/us/en/trust-center'; + + it('extracts intra-page anchors on the same path', () => { + const links = [ + 'https://ui.com/us/en/trust-center#philosophy', + 'https://ui.com/us/en/trust-center#cloud-security', + 'https://ui.com/us/en/trust-center#corporate-security', + 'https://ui.com/us/en/trust-center#ndaa-compliance', + ]; + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result.map((r) => r.url)).toEqual( + expect.arrayContaining([ + 'https://ui.com/us/en/trust-center#philosophy', + 'https://ui.com/us/en/trust-center#cloud-security', + 'https://ui.com/us/en/trust-center#corporate-security', + 'https://ui.com/us/en/trust-center#ndaa-compliance', + ]), + ); + expect(result).toHaveLength(4); + }); + + it('extracts same-path child URLs', () => { + const links = [ + 'https://acme.com/trust-center/cloud-security', + 'https://acme.com/trust-center/data-centers', + ]; + + const result = discoverSectionUrls({ + sourceUrl: 'https://acme.com/trust-center', + links, + }); + + expect(result.map((r) => r.url).sort()).toEqual([ + 'https://acme.com/trust-center/cloud-security', + 'https://acme.com/trust-center/data-centers', + ]); + }); + + it('rejects external-domain links', 
() => { + const links = [ + 'https://ui.com/us/en/trust-center#cloud-security', + 'https://example.com/trust', + 'https://malicious.site/trust-center#fake', + ]; + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result).toHaveLength(1); + expect(result[0].url).toBe( + 'https://ui.com/us/en/trust-center#cloud-security', + ); + }); + + it('rejects the source URL itself', () => { + const links = [ + 'https://ui.com/us/en/trust-center', + 'https://ui.com/us/en/trust-center#cloud-security', + ]; + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result.map((r) => r.url)).toEqual([ + 'https://ui.com/us/en/trust-center#cloud-security', + ]); + }); + + it('dedupes identical URLs', () => { + const links = [ + 'https://ui.com/us/en/trust-center#cloud-security', + 'https://ui.com/us/en/trust-center#cloud-security', + ]; + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result).toHaveLength(1); + }); + + it('caps at 25 sections (safety fuse)', () => { + const links = Array.from( + { length: 40 }, + (_, i) => `https://ui.com/us/en/trust-center#section-${i}`, + ); + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result).toHaveLength(25); + }); + + it('handles source URLs with trailing slash', () => { + const links = ['https://acme.com/trust-center/cloud-security']; + + const result = discoverSectionUrls({ + sourceUrl: 'https://acme.com/trust-center/', + links, + }); + + expect(result).toHaveLength(1); + expect(result[0].url).toBe('https://acme.com/trust-center/cloud-security'); + }); + + it('skips unparseable links silently', () => { + const links = [ + 'not-a-url', + '', + 'https://ui.com/us/en/trust-center#cloud-security', + ]; + + const result = discoverSectionUrls({ sourceUrl, links }); + + expect(result).toHaveLength(1); + }); + + it('derives a section label from the anchor fragment', () => { + const links = ['https://ui.com/us/en/trust-center#cloud-security']; + + const result = 
discoverSectionUrls({ sourceUrl, links }); + + expect(result[0].label).toBe('cloud-security'); + expect(result[0].anchor).toBe('#cloud-security'); + }); + + it('derives a section label from the trailing path segment', () => { + const links = ['https://acme.com/trust-center/cloud-security']; + + const result = discoverSectionUrls({ + sourceUrl: 'https://acme.com/trust-center', + links, + }); + + expect(result[0].label).toBe('cloud-security'); + expect(result[0].anchor).toBeNull(); + }); + + it('returns an empty array when links is undefined or empty', () => { + expect(discoverSectionUrls({ sourceUrl, links: [] })).toEqual([]); + expect( + discoverSectionUrls({ sourceUrl, links: undefined as unknown as string[] }), + ).toEqual([]); + }); +}); diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.ts new file mode 100644 index 0000000000..47c9b2afe0 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-sections.ts @@ -0,0 +1,101 @@ +// Pure helper: convert a Firecrawl scrape's `links` array into an ordered, +// deduped list of section URLs for the trust-portal deep-scrape pass. +// +// A "section URL" is either: +// - an intra-page anchor on the same path as the source URL (e.g. `/trust-center#cloud-security`) +// - a same-origin URL whose path is nested under the source path (e.g. `/trust-center/cloud-security`) +// +// Cross-origin links, the source URL itself, and duplicates are dropped. + +export const MAX_SECTION_URLS = 25; + +export type DeepScrapeSection = { + url: string; + /** The anchor fragment including the `#` (e.g. `#cloud-security`), or null for path-based sections. */ + anchor: string | null; + /** A human-friendly label used for logging and markdown section headers. 
*/ + label: string; + /** + * When present, the section must be revealed by clicking a DOM element whose + * textContent equals this value. Used for SPA trust portals where sidebar + * items are buttons/divs without href attributes (e.g. Ubiquiti). + */ + tabLabel?: string | null; +}; + +function stripTrailingSlash(path: string): string { + return path.length > 1 && path.endsWith('/') ? path.slice(0, -1) : path; +} + +function deriveLabel(sectionUrl: URL, anchor: string | null): string { + if (anchor) { + return anchor.slice(1); // drop leading `#` + } + const segments = stripTrailingSlash(sectionUrl.pathname).split('/'); + return segments[segments.length - 1] || sectionUrl.pathname; +} + +export function discoverSectionUrls(params: { + sourceUrl: string; + links: string[]; +}): DeepScrapeSection[] { + const { sourceUrl, links } = params; + if (!links || links.length === 0) return []; + + let source: URL; + try { + source = new URL(sourceUrl); + } catch { + return []; + } + + const sourceOrigin = source.origin; + const sourcePath = stripTrailingSlash(source.pathname); + const sourceCanonical = `${sourceOrigin}${sourcePath}`; + + const seen = new Set(); + const sections: DeepScrapeSection[] = []; + + for (const raw of links) { + if (sections.length >= MAX_SECTION_URLS) break; + if (!raw || typeof raw !== 'string') continue; + + let parsed: URL; + try { + parsed = new URL(raw); + } catch { + continue; + } + + if (parsed.origin !== sourceOrigin) continue; + + const parsedPath = stripTrailingSlash(parsed.pathname); + const hasFragment = parsed.hash && parsed.hash.length > 1; + + const isIntraPageAnchor = parsedPath === sourcePath && hasFragment; + const isSamePathChild = + !hasFragment && + parsedPath !== sourcePath && + (parsedPath.startsWith(`${sourcePath}/`) || + (sourcePath === '' && parsedPath.startsWith('/'))); + + if (!isIntraPageAnchor && !isSamePathChild) continue; + + const anchor = isIntraPageAnchor ? parsed.hash : null; + const canonical = anchor + ? 
`${sourceCanonical}${anchor}` + : `${sourceOrigin}${parsedPath}`; + + if (canonical === sourceCanonical) continue; + if (seen.has(canonical)) continue; + seen.add(canonical); + + sections.push({ + url: canonical, + anchor, + label: deriveLabel(new URL(canonical), anchor), + }); + } + + return sections; +} diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-tabs.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-tabs.ts new file mode 100644 index 0000000000..934334d1c7 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape-tabs.ts @@ -0,0 +1,92 @@ +import { logger } from '@trigger.dev/sdk'; +import { anthropic } from '@ai-sdk/anthropic'; +import { generateObject } from 'ai'; +import { z } from 'zod'; + +/** + * Some trust portals are SPAs whose sidebar items are buttons/divs without + * href attributes — Firecrawl's `links` format doesn't enumerate them. + * When URL-based section discovery yields nothing, ask Claude Sonnet 4.6 + * to extract sidebar/tab labels from the initial markdown so the orchestrator + * can click each by text content. + */ + +const TAB_MODEL = 'claude-sonnet-4-6'; +const MAX_TABS = 15; +const MARKDOWN_LIMIT = 12_000; + +const tabSchema = z.object({ + tabLabels: z + .array(z.string()) + .describe( + 'Sidebar/tab labels present on the trust portal landing page. Each label is a short phrase (1-4 words) that, when clicked, reveals additional security/compliance content. Return an empty array if no such items exist.', + ) + .default([]), +}); + +function buildPrompt(args: { + vendorName: string; + initialMarkdown: string; +}): string { + return `You are analyzing the markdown of a vendor's trust portal landing page. + +Some trust portals are single-page apps where sidebar/tab items don't have real href URLs — they're buttons that reveal additional security/compliance content when clicked. 
Your job is to identify those sidebar/tab labels so a downstream scraper can programmatically click each one. + +Vendor: ${args.vendorName} + +Include labels that: +- Look like sidebar/tab nav items (typically 1-4 words, e.g. "Cloud Security", "NDAA Compliance", "Corporate Security", "Certifications", "Reports", "Data Centers", "Subprocessors", "Bug Bounty Program", "Advisory Bulletins", "Overview", "Policies"). +- Sit inside or near the trust/security content region of the page. + +Exclude: +- Site-wide navigation labels ("Home", "Products", "Store", "Support", "Contact Us", "Careers", "Blog", "Training", "Investor Relations", "What's New"). +- Footer / legal items ("Privacy Policy", "Terms of Service", "Legal"). +- Product category labels ("Cloud Gateways", "Switching", "WiFi", "Camera Security", "Door Access", "Integrations"). + +Return at most ${MAX_TABS} labels. Return an empty array if you see no sidebar/tab items. + +Markdown: + +${args.initialMarkdown.slice(0, MARKDOWN_LIMIT)}`; +} + +export async function identifySidebarTabs(params: { + vendorName: string; + initialMarkdown: string; +}): Promise { + const { vendorName, initialMarkdown } = params; + + if (!initialMarkdown || initialMarkdown.trim().length === 0) { + return []; + } + + try { + const { object } = await generateObject({ + model: anthropic(TAB_MODEL), + schema: tabSchema, + prompt: buildPrompt({ vendorName, initialMarkdown }), + }); + + const deduped = Array.from( + new Set( + (object.tabLabels ?? []) + .map((l) => l.trim()) + .filter((l) => l.length > 0 && l.length <= 60), + ), + ).slice(0, MAX_TABS); + + logger.info('Trust portal deep-scrape: tab labels identified', { + vendorName, + count: deduped.length, + tabLabelsJson: JSON.stringify(deduped), + }); + + return deduped; + } catch (error) { + logger.warn('Trust portal deep-scrape: tab identification failed', { + vendorName, + error: error instanceof Error ? 
error.message : String(error), + }); + return []; + } +} diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.spec.ts new file mode 100644 index 0000000000..c040aab360 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.spec.ts @@ -0,0 +1,471 @@ +import { deepScrapeTrustPortal } from './trust-portal-deep-scrape'; + +jest.mock('@trigger.dev/sdk', () => ({ + logger: { + warn: jest.fn(), + info: jest.fn(), + debug: jest.fn(), + error: jest.fn(), + }, +})); + +jest.mock('@ai-sdk/anthropic', () => ({ + anthropic: jest.fn(() => 'claude-mock-model'), +})); + +const generateObjectMock = jest.fn(); +jest.mock('ai', () => ({ + generateObject: (...args: unknown[]) => generateObjectMock(...args), +})); + +type ScrapeMock = jest.Mock< + Promise<{ markdown?: string; links?: string[] }>, + [string, Record?] +>; + +function makeFirecrawlMock(scrape: ScrapeMock) { + return { scrape } as unknown as import('@mendable/firecrawl-js').default; +} + +describe('deepScrapeTrustPortal — gate', () => { + beforeEach(() => { + generateObjectMock.mockReset(); + }); + + it('returns null when sourceUrl is null', async () => { + const scrape = jest.fn(); + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: null, + firecrawlClient: makeFirecrawlMock(scrape as ScrapeMock), + }); + expect(result).toBeNull(); + expect(scrape).not.toHaveBeenCalled(); + }); + + it('returns null when source URL is on a known third-party portal host', async () => { + const scrape = jest.fn(); + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.trust.page', + firecrawlClient: makeFirecrawlMock(scrape as ScrapeMock), + }); + expect(result).toBeNull(); + expect(scrape).not.toHaveBeenCalled(); + }); + + it('returns null when source URL 
is not on the vendor domain', async () => { + const scrape = jest.fn(); + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://some-other-site.com/trust', + firecrawlClient: makeFirecrawlMock(scrape as ScrapeMock), + }); + expect(result).toBeNull(); + expect(scrape).not.toHaveBeenCalled(); + }); + + it('returns null when source URL is unparseable', async () => { + const scrape = jest.fn(); + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'not a url', + firecrawlClient: makeFirecrawlMock(scrape as ScrapeMock), + }); + expect(result).toBeNull(); + expect(scrape).not.toHaveBeenCalled(); + }); +}); + +describe('deepScrapeTrustPortal — extraction', () => { + beforeEach(() => { + generateObjectMock.mockReset(); + }); + + it('extracts SOC 2, ISO 27001, PCI-DSS from a Ubiquiti-shaped SPA trust portal', async () => { + const sourceUrl = 'https://ui.com/us/en/trust-center'; + + const scrape: ScrapeMock = jest + .fn() + // Initial scrape returns the landing page + all sidebar links + .mockResolvedValueOnce({ + markdown: '# Secure by Design\nUbiquiti trust overview.', + links: [ + 'https://ui.com/us/en/trust-center', + 'https://ui.com/us/en/trust-center#philosophy', + 'https://ui.com/us/en/trust-center#ndaa-compliance', + 'https://ui.com/us/en/trust-center#cloud-security', + 'https://ui.com/us/en/trust-center#corporate-security', + ], + }) + // Per-section scrapes + .mockResolvedValueOnce({ markdown: '# Philosophy\nSecurity first.' }) + .mockResolvedValueOnce({ + markdown: + '# NDAA Compliance\nUbiquiti products are NDAA Section 889 compliant.', + }) + .mockResolvedValueOnce({ + markdown: + '# Cloud Security\n\nBadges: Soc 2 Type II, ISO/IEC 27001:2013, PCI-DSS. 
All verified.', + }) + .mockResolvedValueOnce({ + markdown: + '# Corporate Security\nPolicies covering employees and contractors.', + }); + + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'Soc 2 Type II', + }, + { + type: 'ISO 27001', + status: 'verified', + evidence_snippet: 'ISO/IEC 27001:2013', + }, + { + type: 'PCI DSS', + status: 'verified', + evidence_snippet: 'PCI-DSS', + }, + ], + }, + }); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Ubiquiti', + vendorDomain: 'ui.com', + sourceUrl, + firecrawlClient: makeFirecrawlMock(scrape), + }); + + expect(result).not.toBeNull(); + expect(result).toHaveLength(3); + expect(result?.map((c) => c.type).sort()).toEqual([ + 'ISO 27001', + 'PCI DSS', + 'SOC 2 Type II', + ]); + expect(result?.every((c) => c.status === 'verified')).toBe(true); + + // 1 initial + 4 sections = 5 scrape calls + expect(scrape).toHaveBeenCalledTimes(5); + + // First call should be the source URL with a wait action. + expect(scrape).toHaveBeenNthCalledWith( + 1, + sourceUrl, + expect.objectContaining({ + formats: expect.arrayContaining(['markdown', 'links']), + onlyMainContent: false, + }), + ); + + // AI extraction called once with combined markdown. 
+ expect(generateObjectMock).toHaveBeenCalledTimes(1); + const aiCall = generateObjectMock.mock.calls[0][0]; + expect(aiCall.prompt).toContain('Cloud Security'); + expect(aiCall.prompt).toContain('PCI-DSS'); + }); + + it('continues with remaining sections when one scrape fails', async () => { + const scrape: ScrapeMock = jest + .fn() + .mockResolvedValueOnce({ + markdown: '# Landing', + links: [ + 'https://acme.com/trust#one', + 'https://acme.com/trust#two', + ], + }) + .mockRejectedValueOnce(new Error('network timeout')) + .mockResolvedValueOnce({ + markdown: '# Two\nWe are SOC 2 Type II verified.', + }); + + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'SOC 2 Type II verified', + }, + ], + }, + }); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + expect(result).toEqual([ + expect.objectContaining({ type: 'SOC 2 Type II', status: 'verified' }), + ]); + }); + + it('returns null when the initial scrape fails', async () => { + const scrape: ScrapeMock = jest + .fn() + .mockRejectedValueOnce(new Error('network error')); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + expect(result).toBeNull(); + }); + + it('returns null when AI extraction throws', async () => { + const scrape: ScrapeMock = jest.fn().mockResolvedValueOnce({ + markdown: '# Trust center content', + links: [], + }); + // First generateObject call is identifySidebarTabs; return no tabs so + // the flow proceeds straight to cert extraction. 
+ generateObjectMock.mockResolvedValueOnce({ object: { tabLabels: [] } }); + generateObjectMock.mockRejectedValueOnce(new Error('model error')); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + expect(result).toBeNull(); + }); + + it('drops extracted certs whose evidence_snippet is empty', async () => { + const scrape: ScrapeMock = jest.fn().mockResolvedValueOnce({ + markdown: '# Trust', + links: [], + }); + + generateObjectMock.mockResolvedValueOnce({ object: { tabLabels: [] } }); + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'SOC 2 Type II report available on request', + }, + { type: 'Totally Made Up Cert', status: 'verified', evidence_snippet: '' }, + ], + }, + }); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + expect(result).toHaveLength(1); + expect(result?.[0].type).toBe('SOC 2 Type II'); + }); + + it('runs AI extraction on initial markdown when there are no sidebar sections', async () => { + const scrape: ScrapeMock = jest.fn().mockResolvedValueOnce({ + markdown: + '# Trust\nWe hold SOC 2 Type II and ISO 27001 certifications.', + links: [], + }); + + generateObjectMock.mockResolvedValueOnce({ object: { tabLabels: [] } }); + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'SOC 2 Type II', + }, + { + type: 'ISO 27001', + status: 'verified', + evidence_snippet: 'ISO 27001', + }, + ], + }, + }); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + 
}); + + expect(scrape).toHaveBeenCalledTimes(1); + expect(result?.map((c) => c.type).sort()).toEqual([ + 'ISO 27001', + 'SOC 2 Type II', + ]); + }); + + it('discovers SPA tab labels via LLM and scrapes each by clicking text', async () => { + const scrape: ScrapeMock = jest + .fn() + .mockResolvedValueOnce({ + markdown: + '# Secure by Design\nPhilosophy\nNDAA Compliance\nCloud Security', + links: [], // No sidebar anchors — triggers tab-label discovery + }) + .mockResolvedValueOnce({ + markdown: '# Philosophy\nWe believe in edge-first security.', + }) + .mockResolvedValueOnce({ + markdown: '# Cloud Security\nSOC 2 Type II, ISO 27001, PCI-DSS.', + }); + + // First LLM call: sidebar tabs. Second: cert extraction. + generateObjectMock.mockResolvedValueOnce({ + object: { tabLabels: ['Philosophy', 'Cloud Security'] }, + }); + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'SOC 2 Type II', + }, + { + type: 'ISO 27001', + status: 'verified', + evidence_snippet: 'ISO 27001', + }, + ], + }, + }); + + const result = await deepScrapeTrustPortal({ + vendorName: 'Ubiquiti', + vendorDomain: 'ui.com', + sourceUrl: 'https://ui.com/trust-center', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + // 1 initial + 2 tab-label scrapes = 3 scrape calls + expect(scrape).toHaveBeenCalledTimes(3); + + // Each tab scrape must use executeJavascript click-by-text actions. + const tabCall = scrape.mock.calls[1]; + const actions = + (tabCall[1] as { actions?: Array<{ type: string; script?: string }> }) + ?.actions ?? 
[]; + const jsAction = actions.find((a) => a.type === 'executeJavascript'); + expect(jsAction?.script).toBeDefined(); + expect(jsAction?.script).toContain('"Philosophy"'); + + expect(result?.map((c) => c.type).sort()).toEqual([ + 'ISO 27001', + 'SOC 2 Type II', + ]); + }); + + it('escapes CSS special characters in anchor selectors', async () => { + // Use a backslash in the anchor: `\` is a CSS special character that must + // be escaped as `\\` inside attribute values, and it survives URL parsing + // (unlike `"` which browsers percent-encode to `%22` in the fragment). + const scrape: ScrapeMock = jest + .fn() + .mockResolvedValueOnce({ + markdown: '# Landing', + links: ['https://acme.com/trust#weird\\section'], + }) + .mockResolvedValueOnce({ markdown: '# Weird\nWe are ISO 27001 certified.' }); + + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'ISO 27001', + status: 'verified', + evidence_snippet: 'ISO 27001 certified', + }, + ], + }, + }); + + await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl: 'https://acme.com/trust', + firecrawlClient: makeFirecrawlMock(scrape), + }); + + // The second call is the section scrape. Its selector should contain the + // escaped backslash (`\\`) not the raw single backslash. + const sectionCall = scrape.mock.calls[1]; + const actions = (sectionCall[1] as { actions?: Array<{ type: string; selector?: string }> })?.actions ?? 
[]; + const clickAction = actions.find((a) => a.type === 'click'); + expect(clickAction?.selector).toBeDefined(); + // cssEscapeAttr converts `\` → `\\`, so the selector contains `\\section` + expect(clickAction?.selector).toContain('#weird\\\\section'); + // Raw single backslash should NOT appear unescaped in the selector string + expect(clickAction?.selector).not.toMatch(/#weird\\[^\\]/); + }); + + it('scrapes every section exactly once when section count exceeds concurrency bound', async () => { + const anchors = Array.from({ length: 8 }, (_, i) => `#section-${i}`); + const sourceUrl = 'https://acme.com/trust'; + + const scrape: ScrapeMock = jest.fn(async (url: string) => { + if (url === sourceUrl) { + return { + markdown: '# Landing', + links: anchors.map((a) => `${sourceUrl}${a}`), + }; + } + return { markdown: `# ${url}\nplaceholder` }; + }) as ScrapeMock; + + generateObjectMock.mockResolvedValueOnce({ + object: { + certifications: [ + { + type: 'SOC 2 Type II', + status: 'verified', + evidence_snippet: 'SOC 2 Type II', + }, + ], + }, + }); + + await deepScrapeTrustPortal({ + vendorName: 'Acme', + vendorDomain: 'acme.com', + sourceUrl, + firecrawlClient: makeFirecrawlMock(scrape), + }); + + // 1 initial + 8 sections = 9 scrape calls + expect(scrape).toHaveBeenCalledTimes(9); + + // Each section URL should have been requested exactly once. 
+ const sectionCalls = scrape.mock.calls + .slice(1) + .map((call) => call[0] as string); + expect(new Set(sectionCalls).size).toBe(8); + for (const anchor of anchors) { + expect(sectionCalls).toContain(`${sourceUrl}${anchor}`); + } + }); +}); diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.ts new file mode 100644 index 0000000000..dfd759b6cb --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/trust-portal-deep-scrape.ts @@ -0,0 +1,293 @@ +import Firecrawl from '@mendable/firecrawl-js'; +import { logger } from '@trigger.dev/sdk'; +import { anthropic } from '@ai-sdk/anthropic'; +import { generateObject } from 'ai'; +import { z } from 'zod'; +import type { + VendorRiskAssessmentCertification, + VendorRiskAssessmentCertificationStatus, +} from './agent-types'; +import { isKnownThirdPartyPortalHost } from './url-validation'; +import { + discoverSectionUrls, + MAX_SECTION_URLS, + type DeepScrapeSection, +} from './trust-portal-deep-scrape-sections'; +import { identifySidebarTabs } from './trust-portal-deep-scrape-tabs'; +import { + buildInitialScrapeOptions, + buildSectionScrapeOptions, +} from './trust-portal-deep-scrape-scrape-options'; + +const EXTRACTION_MODEL = 'claude-sonnet-4-6'; +const SECTION_CONCURRENCY = 5; +const MARKDOWN_TRUNCATE_LIMIT = 200_000; + +const certificationExtractionSchema = z.object({ + certifications: z.array(z.object({ + type: z.string().describe( + 'Canonical certification name, e.g. 
"SOC 2 Type II", "ISO 27001", "PCI DSS", "ISO 27017", "FedRAMP", "HIPAA", "GDPR", "ISO 42001"', + ), + status: z.enum(['verified', 'expired', 'not_certified', 'unknown']).describe( + 'verified when the page lists this framework as current; expired only if explicitly said so; not_certified only if the page explicitly says so; unknown otherwise', + ), + issued_at: z.string().optional().nullable(), + expires_at: z.string().optional().nullable(), + evidence_snippet: z.string().describe( + 'Short quote from the markdown (< 200 chars) that supports this certification. Must be present in the markdown verbatim.', + ), + })).default([]), +}); + +type ScrapeResponse = { markdown?: string; links?: string[] }; + +function truncateMarkdown(input: string): string { + if (input.length <= MARKDOWN_TRUNCATE_LIMIT) return input; + logger.warn('Trust portal combined markdown truncated for extraction', { + originalLength: input.length, + limit: MARKDOWN_TRUNCATE_LIMIT, + }); + return input.slice(0, MARKDOWN_TRUNCATE_LIMIT); +} + +function buildExtractionPrompt(args: { + vendorName: string; + combinedMarkdown: string; +}): string { + return `You are extracting security and compliance certifications from a vendor's trust center page. + +Vendor: ${args.vendorName} + +Rules: +- Only return certifications that are explicitly listed in the markdown below. +- Never invent certifications. If a certification is not mentioned, do not include it. +- Mark status as "verified" when the page lists it as a current/active framework (including badge callouts and "we are certified" language). +- Mark status as "expired" only when the page explicitly says the certification has lapsed. +- Mark status as "not_certified" only when the page explicitly says the vendor is not certified. +- Otherwise use "unknown". +- Normalize the type name to canonical form (e.g. "Soc 2 Type II" → "SOC 2 Type II", "ISO/IEC 27001:2013" → "ISO 27001", "PCI-DSS" → "PCI DSS"). 
/**
 * Runs `worker` over every item with a bounded number of calls in flight and
 * reports each outcome in input order, in the same shape as
 * `Promise.allSettled`.
 *
 * A worker failure never rejects the returned promise; it is recorded as a
 * `rejected` entry at the index of the item that failed.
 *
 * @param items - Inputs to process. An empty array resolves to `[]`.
 * @param concurrency - Upper bound on simultaneous worker calls. Values below
 *   1 (and `NaN`) are treated as 1 so every item is still processed.
 * @param worker - Async function applied to each item.
 * @returns One settled result per input item, index-aligned with `items`.
 */
async function mapWithConcurrency<T, R>(
  items: T[],
  concurrency: number,
  worker: (item: T) => Promise<R>,
): Promise<Array<PromiseSettledResult<R>>> {
  const results: Array<PromiseSettledResult<R>> = new Array(items.length);

  // A non-positive or NaN bound would spawn zero runners and leave `results`
  // full of holes; callers then crash reading `.status`. Clamp to >= 1.
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
  const runnerCount = Math.min(Math.max(1, requested), items.length);

  // Shared cursor: each runner claims the next unprocessed index. The claim is
  // synchronous, so no two runners can take the same item.
  let cursor = 0;
  const runners = Array.from({ length: runnerCount }, async () => {
    while (true) {
      const index = cursor++;
      if (index >= items.length) return;
      try {
        results[index] = {
          status: 'fulfilled',
          value: await worker(items[index]),
        };
      } catch (reason) {
        results[index] = { status: 'rejected', reason };
      }
    }
  });

  await Promise.all(runners);
  return results;
}

/**
 * Returns true when `host` is the vendor domain itself or one of its
 * subdomains. The comparison ignores case and surrounding whitespace in
 * `vendorDomain`; an empty vendor domain never matches.
 */
function isHostOnVendorDomain(host: string, vendorDomain: string): boolean {
  const normalizedHost = host.toLowerCase();
  const normalizedDomain = vendorDomain.trim().toLowerCase();
  // Without this guard an empty domain would turn the suffix check into
  // `endsWith('.')`, which matches any dot-terminated hostname.
  if (normalizedDomain.length === 0) return false;
  return (
    normalizedHost === normalizedDomain ||
    normalizedHost.endsWith(`.${normalizedDomain}`)
  );
}

/** Inputs for {@link deepScrapeTrustPortal}. */
export type DeepScrapeParams = {
  /** Vendor display name; used in log context and in the extraction prompt. */
  vendorName: string;
  /** Domain the source URL must live on (exact host or a subdomain). */
  vendorDomain: string;
  /** Trust-portal URL to scrape. `null` makes the function return `null`. */
  sourceUrl: string | null;
  /** Firecrawl client used for the initial and per-section scrapes. */
  firecrawlClient: Firecrawl;
};

/**
 * Scrapes a vendor-hosted trust portal (landing page plus its discovered
 * sections) and asks an LLM to extract the certifications it lists.
 *
 * Flow:
 * 1. Validate the source URL: it must parse, must not be a known third-party
 *    portal host, and must be on the vendor domain.
 * 2. Scrape the landing page for markdown and links.
 * 3. Discover sections from the links; when none are found, ask
 *    `identifySidebarTabs` for tab labels and synthesize click-by-text
 *    sections. Sections are de-duplicated by label and capped at
 *    `MAX_SECTION_URLS`.
 * 4. Scrape each section with bounded concurrency; failed sections are logged
 *    and skipped.
 * 5. Run structured extraction over the combined (truncated) markdown and keep
 *    only certifications that carry a non-empty evidence snippet.
 *
 * @returns The extracted certifications, or `null` when the URL is rejected,
 *   the initial scrape or the AI extraction fails, the combined markdown is
 *   empty, or no certification survives filtering.
 */
export async function deepScrapeTrustPortal(
  params: DeepScrapeParams,
): Promise<VendorRiskAssessmentCertification[] | null> {
  const { vendorName, vendorDomain, sourceUrl, firecrawlClient } = params;

  if (!sourceUrl) return null;

  let source: URL;
  try {
    source = new URL(sourceUrl);
  } catch {
    return null;
  }

  const host = source.hostname.toLowerCase();
  if (isKnownThirdPartyPortalHost(host)) {
    logger.info(
      'Trust portal deep-scrape skipped: third-party portal host already handled by agent',
      { vendorName, host },
    );
    return null;
  }

  if (!isHostOnVendorDomain(host, vendorDomain)) {
    logger.info(
      'Trust portal deep-scrape skipped: source URL is not on vendor domain',
      { vendorName, host, vendorDomain },
    );
    return null;
  }

  logger.info('Trust portal deep-scrape starting', {
    vendorName,
    sourceUrl,
  });

  // 1. Initial scrape
  let initial: ScrapeResponse;
  try {
    initial = (await firecrawlClient.scrape(
      sourceUrl,
      buildInitialScrapeOptions() as unknown as Record<string, unknown>,
    )) as ScrapeResponse;
  } catch (error) {
    logger.warn('Trust portal deep-scrape: initial scrape failed', {
      vendorName,
      sourceUrl,
      error: error instanceof Error ? error.message : String(error),
    });
    return null;
  }

  const initialMarkdown = initial.markdown ?? '';
  const links = Array.isArray(initial.links) ? initial.links : [];
  logger.info('Trust portal deep-scrape: initial scrape returned', {
    vendorName,
    sourceUrl,
    markdownLength: initialMarkdown.length,
    linkCount: links.length,
  });

  // 2. Discover sections
  const urlSections = discoverSectionUrls({ sourceUrl, links });

  // 2a. If URL-based discovery found nothing (SPA sidebar with no hrefs),
  //     ask an LLM to identify tab labels from the initial markdown and
  //     synthesize click-by-text sections. Tab discovery is best-effort: a
  //     failure falls back to extracting from the landing page alone instead
  //     of rejecting, matching how every other step here degrades.
  let tabLabels: string[] = [];
  if (urlSections.length === 0 && initialMarkdown.trim().length > 0) {
    try {
      tabLabels = await identifySidebarTabs({ vendorName, initialMarkdown });
    } catch (error) {
      logger.warn('Trust portal deep-scrape: sidebar tab discovery failed', {
        vendorName,
        sourceUrl,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }
  const tabSections: DeepScrapeSection[] = tabLabels.map((tabLabel) => ({
    url: sourceUrl,
    anchor: null,
    label: tabLabel,
    tabLabel,
  }));

  // De-duplicate by case-insensitive label and cap the number of sections.
  const seenLabels = new Set<string>();
  const sections: DeepScrapeSection[] = [];
  for (const s of [...urlSections, ...tabSections]) {
    const key = s.label.trim().toLowerCase();
    if (!key || seenLabels.has(key)) continue;
    seenLabels.add(key);
    sections.push(s);
    if (sections.length >= MAX_SECTION_URLS) break;
  }

  logger.info('Trust portal deep-scrape: sections discovered', {
    vendorName,
    sectionCount: sections.length,
    urlSectionCount: urlSections.length,
    tabSectionCount: tabSections.length,
    sections: sections.map((s) => s.label),
  });

  // 3. Per-section scrapes (bounded concurrency)
  const sectionResults = await mapWithConcurrency(
    sections,
    SECTION_CONCURRENCY,
    async (section) => {
      const response = (await firecrawlClient.scrape(
        section.url,
        buildSectionScrapeOptions(section) as unknown as Record<
          string,
          unknown
        >,
      )) as ScrapeResponse;
      return { section, markdown: response.markdown ?? '' };
    },
  );

  const sectionChunks: string[] = [];
  for (const [index, result] of sectionResults.entries()) {
    if (result.status === 'fulfilled') {
      const { section, markdown } = result.value;
      if (markdown.trim().length > 0) {
        sectionChunks.push(
          `\n\n---\n# Section: ${section.label}\n\n${markdown}`,
        );
      }
    } else {
      // A single failed section must not sink the whole pass.
      logger.warn('Trust portal deep-scrape: section scrape failed', {
        vendorName,
        section: sections[index].label,
        error:
          result.reason instanceof Error
            ? result.reason.message
            : String(result.reason),
      });
    }
  }

  const combinedMarkdown = truncateMarkdown(
    [initialMarkdown, ...sectionChunks].join(''),
  );

  if (combinedMarkdown.trim().length === 0) {
    logger.warn(
      'Trust portal deep-scrape: combined markdown is empty, skipping extraction',
      { vendorName, sourceUrl },
    );
    return null;
  }

  // 4. AI extraction
  type ExtractedCert = {
    type: string;
    status: VendorRiskAssessmentCertificationStatus;
    issued_at?: string | null;
    expires_at?: string | null;
    evidence_snippet: string;
  };
  let extracted: { certifications: ExtractedCert[] };
  try {
    const { object } = await generateObject({
      model: anthropic(EXTRACTION_MODEL),
      schema: certificationExtractionSchema,
      prompt: buildExtractionPrompt({ vendorName, combinedMarkdown }),
    });
    extracted = object;
  } catch (error) {
    logger.warn('Trust portal deep-scrape: AI extraction failed', {
      vendorName,
      error: error instanceof Error ? error.message : String(error),
    });
    return null;
  }

  // Certifications without supporting evidence are discarded.
  const certifications: VendorRiskAssessmentCertification[] =
    extracted.certifications
      .filter(
        (c) => c.evidence_snippet && c.evidence_snippet.trim().length > 0,
      )
      .map((c) => ({
        type: c.type,
        status: c.status,
        issuedAt: c.issued_at ?? null,
        expiresAt: c.expires_at ?? null,
        url: null,
      }));

  logger.info('Trust portal deep-scrape: completed', {
    vendorName,
    certificationCount: certifications.length,
    sectionCount: sections.length,
    initialMarkdownLength: initialMarkdown.length,
    combinedMarkdownLength: combinedMarkdown.length,
  });

  return certifications.length > 0 ? certifications : null;
}