diff --git a/.claude/settings.json b/.claude/settings.json index c3e573c..902bdd8 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -6,7 +6,8 @@ "Bash(pnpm typecheck:*)", "Bash(pnpm test:*)", "Bash(pnpm format:*)", - "Bash(pnpm --filter:*)" + "Bash(pnpm --filter:*)", + "Bash(gh api *)" ] } } diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6923e6d..93068a2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -101,7 +101,13 @@ jobs: - name: Setup uses: ./.github/actions/setup - - name: Deploy to Cloudflare Workers + - name: Deploy browser-scraper to Cloudflare Workers + run: pnpm --filter @repo/browser-scraper run deploy + env: + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} + CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + + - name: Deploy operator to Cloudflare Workers run: pnpm --filter @repo/operator run deploy env: CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} diff --git a/apps/browser-scraper/eslint.config.ts b/apps/browser-scraper/eslint.config.ts new file mode 100644 index 0000000..f57754c --- /dev/null +++ b/apps/browser-scraper/eslint.config.ts @@ -0,0 +1,5 @@ +/* eslint-disable import/no-default-export */ +import { baseConfig } from "@repo/eslint/base"; +import { defineConfig } from "eslint/config"; + +export default defineConfig(baseConfig); diff --git a/apps/browser-scraper/package.json b/apps/browser-scraper/package.json new file mode 100644 index 0000000..96e02a3 --- /dev/null +++ b/apps/browser-scraper/package.json @@ -0,0 +1,30 @@ +{ + "name": "@repo/browser-scraper", + "private": true, + "type": "module", + "scripts": { + "dev": "wrangler dev", + "deploy": "wrangler deploy", + "typecheck": "tsc", + "lint": "eslint .", + "test": "vitest run --passWithNoTests" + }, + "dependencies": { + "@cloudflare/playwright": "1.2.0", + "@repo/logger": "workspace:*", + "@repo/url-validator": "workspace:*", + "zod": "4.3.6" + }, + "devDependencies": { + "@cloudflare/workers-types": "4.20260412.1", + "@repo/eslint": "workspace:*", + "@repo/prettier": "workspace:*", + "@repo/typescript": "workspace:*", + "@types/node": "25.6.0", + "eslint": "9.39.1", + "prettier": "3.8.2", + "typescript": "6.0.2", + "vitest": "4.1.4", + "wrangler": "4.81.1" + } +} diff --git a/apps/browser-scraper/src/index.ts b/apps/browser-scraper/src/index.ts new file mode 100644 index 0000000..02fc1ea --- /dev/null +++ b/apps/browser-scraper/src/index.ts @@ -0,0 +1,64 @@ +import type { BrowserWorker } from "@cloudflare/playwright"; +import { Logger } from "@repo/logger"; +import { validateSourceUrl } from "@repo/url-validator"; +import { z } from "zod"; + +import { PlaywrightService } from "./services/playwright"; + +type Env = { BROWSER: BrowserWorker }; + +const requestSchema = z.object({ url: z.string() }); + +const handle = async (request: Request, env: Env): Promise => { + const logger = new Logger({ context: "browser-scraper" }); + + if (request.method !== "POST") { + return Response.json( + { ok: false, error: "Method not allowed" }, + { status: 405 } + ); + } + + let parsed: z.infer; + try { + const body: unknown = await request.json(); + parsed = requestSchema.parse(body); + } catch (error) { + return Response.json( + { + ok: false, + error: error instanceof Error ? error.message : "Invalid body", + }, + { status: 400 } + ); + } + + const check = validateSourceUrl(parsed.url); + if (!check.valid) { + return Response.json( + { ok: false, error: `Invalid URL: ${check.reason}` }, + { status: 400 } + ); + } + + try { + const result = await new PlaywrightService(env.BROWSER, logger).render( + parsed.url + ); + return Response.json(result); + } catch (error) { + logger.error("unexpected render failure", { + url: parsed.url, + errorMessage: error instanceof Error ? error.message : "Unknown error", + }); + return Response.json( + { ok: false, error: "Internal render failure" }, + { status: 500 } + ); + } +}; + +// eslint-disable-next-line import/no-default-export +export default { + fetch: handle, +}; diff --git a/apps/browser-scraper/src/services/playwright.test.ts b/apps/browser-scraper/src/services/playwright.test.ts new file mode 100644 index 0000000..76a5845 --- /dev/null +++ b/apps/browser-scraper/src/services/playwright.test.ts @@ -0,0 +1,117 @@ +import { Logger } from "@repo/logger"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("@cloudflare/playwright", () => ({ + launch: vi.fn(), +})); + +const { launch } = await import("@cloudflare/playwright"); +const { PlaywrightService } = await import("./playwright"); + +type GotoFn = () => Promise<{ + status: () => number; + headers: () => Record; +} | null>; + +const makeMockBrowser = ( + overrides: { + status?: number; + contentType?: string; + finalUrl?: string; + html?: string; + goto?: GotoFn; + } = {} +): { browser: unknown; closeMock: ReturnType } => { + const closeMock = vi.fn().mockResolvedValue(undefined); + const goto: GotoFn = + overrides.goto ?? + (() => + Promise.resolve({ + status: () => overrides.status ?? 200, + headers: () => ({ + "content-type": overrides.contentType ?? "text/html", + }), + })); + const page = { + route: vi.fn().mockResolvedValue(undefined), + goto: vi.fn(goto), + url: vi.fn().mockReturnValue(overrides.finalUrl ?? "https://example.com/"), + content: vi.fn().mockResolvedValue(overrides.html ?? ""), + }; + const browser = { + newPage: vi.fn().mockResolvedValue(page), + close: closeMock, + }; + return { browser, closeMock }; +}; + +describe("PlaywrightService.render", () => { + const logger = new Logger({ context: "test" }); + + beforeEach(() => { + vi.mocked(launch).mockReset(); + }); + + it("returns rendered html on a successful navigation", async () => { + const { browser, closeMock } = makeMockBrowser({ + html: "hello", + }); + vi.mocked(launch).mockResolvedValueOnce(browser as never); + + const result = await new PlaywrightService({} as never, logger).render( + "https://example.com/" + ); + + expect(result).toEqual({ + ok: true, + html: "hello", + finalUrl: "https://example.com/", + status: 200, + contentType: "text/html", + truncated: false, + }); + expect(closeMock).toHaveBeenCalledOnce(); + }); + + it("rejects when the post-redirect URL is unsafe", async () => { + const { browser, closeMock } = makeMockBrowser({ + finalUrl: "https://127.0.0.1/secrets", + }); + vi.mocked(launch).mockResolvedValueOnce(browser as never); + + const result = await new PlaywrightService({} as never, logger).render( + "https://example.com/" + ); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toMatch(/Unsafe final URL/); + } + expect(closeMock).toHaveBeenCalledOnce(); + }); + + it("returns an error when goto throws", async () => { + const { browser, closeMock } = makeMockBrowser({ + goto: () => Promise.reject(new Error("Navigation timeout")), + }); + vi.mocked(launch).mockResolvedValueOnce(browser as never); + + const result = await new PlaywrightService({} as never, logger).render( + "https://example.com/" + ); + + expect(result).toEqual({ ok: false, error: "Navigation timeout" }); + expect(closeMock).toHaveBeenCalledOnce(); + }); + + it("returns an error for non-2xx page status", async () => { + const { browser } = makeMockBrowser({ status: 404 }); + vi.mocked(launch).mockResolvedValueOnce(browser as never); + + const result = await new PlaywrightService({} as never, logger).render( + "https://example.com/missing" + ); + + expect(result).toEqual({ ok: false, error: "HTTP 404", status: 404 }); + }); +}); diff --git a/apps/browser-scraper/src/services/playwright.ts b/apps/browser-scraper/src/services/playwright.ts new file mode 100644 index 0000000..74bbb68 --- /dev/null +++ b/apps/browser-scraper/src/services/playwright.ts @@ -0,0 +1,102 @@ +import { launch } from "@cloudflare/playwright"; +import type { BrowserWorker } from "@cloudflare/playwright"; +import type { Logger } from "@repo/logger"; +import { validateSourceUrl } from "@repo/url-validator"; + +const NAV_TIMEOUT_MS = 15_000; +const MAX_HTML_CHARS = 2 * 1024 * 1024; + +type RenderSuccess = { + ok: true; + html: string; + finalUrl: string; + status: number; + contentType: string; + truncated: boolean; +}; +type RenderError = { ok: false; error: string; status?: number }; +type RenderResult = RenderSuccess | RenderError; + +class PlaywrightService { + constructor( + private readonly browser: BrowserWorker, + private readonly logger: Logger + ) {} + + async render(url: string): Promise { + const start = Date.now(); + const browser = await launch(this.browser); + try { + const page = await browser.newPage(); + + await page.route("**/*", (route) => { + const reqUrl = route.request().url(); + const check = validateSourceUrl(reqUrl); + if (!check.valid) { + this.logger.warn("blocked unsafe subresource", { + url: reqUrl, + reason: check.reason, + }); + return route.abort(); + } + return route.continue(); + }); + + let response; + try { + response = await page.goto(url, { + waitUntil: "domcontentloaded", + timeout: NAV_TIMEOUT_MS, + }); + } catch (error) { + return { + ok: false, + error: error instanceof Error ? error.message : "Navigation failed", + }; + } + + if (!response) { + return { ok: false, error: "No navigation response" }; + } + + const finalUrl = page.url(); + const finalCheck = validateSourceUrl(finalUrl); + if (!finalCheck.valid) { + return { + ok: false, + error: `Unsafe final URL after redirect: ${finalCheck.reason}`, + }; + } + + const status = response.status(); + if (status < 200 || status >= 300) { + return { ok: false, error: `HTTP ${String(status)}`, status }; + } + + const headers = response.headers(); + const contentType = headers["content-type"] ?? ""; + + const html = await page.content(); + const truncated = html.length > MAX_HTML_CHARS; + const finalHtml = truncated ? html.slice(0, MAX_HTML_CHARS) : html; + + return { + ok: true, + html: finalHtml, + finalUrl, + status, + contentType, + truncated, + }; + } finally { + await browser.close(); + this.logger.info("browser render finished", { + url, + browserDurationMs: Date.now() - start, + }); + } + } +} + +export { MAX_HTML_CHARS, NAV_TIMEOUT_MS, PlaywrightService }; +export type { RenderError, RenderResult, RenderSuccess }; diff --git a/apps/browser-scraper/tsconfig.json b/apps/browser-scraper/tsconfig.json new file mode 100644 index 0000000..3416688 --- /dev/null +++ b/apps/browser-scraper/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "@repo/typescript/base.json", + "compilerOptions": { + "types": ["@cloudflare/workers-types", "vitest/globals"] + }, + "include": ["src"], + "exclude": ["node_modules"] +} diff --git a/apps/browser-scraper/wrangler.jsonc b/apps/browser-scraper/wrangler.jsonc new file mode 100644 index 0000000..4d47c61 --- /dev/null +++ b/apps/browser-scraper/wrangler.jsonc @@ -0,0 +1,17 @@ +{ + "$schema": "node_modules/wrangler/config-schema.json", + "name": "switch-operator-browser-scraper", + "main": "src/index.ts", + "compatibility_date": "2026-03-29", + "compatibility_flags": ["nodejs_compat"], + "workers_dev": false, + "observability": { + "logs": { + "enabled": true, + "invocation_logs": true, + }, + }, + "browser": { + "binding": "BROWSER", + }, +} diff --git a/apps/operator/migrations/0005_nervous_scorpion.sql b/apps/operator/migrations/0005_nervous_scorpion.sql new file mode 100644 index 0000000..a480c7b --- /dev/null +++ b/apps/operator/migrations/0005_nervous_scorpion.sql @@ -0,0 +1 @@ +ALTER TABLE `schedules` ADD `use_browser` integer DEFAULT false NOT NULL; \ No newline at end of file diff --git a/apps/operator/migrations/meta/0005_snapshot.json b/apps/operator/migrations/meta/0005_snapshot.json new file mode 100644 index 0000000..e5f58bd --- /dev/null +++ b/apps/operator/migrations/meta/0005_snapshot.json @@ -0,0 +1,231 @@ +{ + "version": "6", + "dialect": "sqlite", + "id": "b69943e8-9429-48a9-b896-82adc44bd6d7", + "prevId": "258e161f-a05c-4114-9e45-2832a63ff0fa", + "tables": { + "pending_actions": { + "name": "pending_actions", + "columns": { + "chat_id": { + "name": "chat_id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "action_type": { + "name": "action_type", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "payload": { + "name": "payload", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "expires_at": { + "name": "expires_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "schedules": { + "name": "schedules", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "chat_id": { + "name": "chat_id", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "schedule_type": { + "name": "schedule_type", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "hour": { + "name": "hour", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "minute": { + "name": "minute", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false, + "default": 0 + }, + "day_of_week": { + "name": "day_of_week", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "day_of_month": { + "name": "day_of_month", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "timezone": { + "name": "timezone", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": "'UTC'" + }, + "fixed_message": { + "name": "fixed_message", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "message_prompt": { + "name": "message_prompt", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "source_url": { + "name": "source_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "keywords": { + "name": "keywords", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "state_json": { + "name": "state_json", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "active": { + "name": "active", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": true + }, + "use_browser": { + "name": "use_browser", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": false + }, + "next_run_at": { + "name": "next_run_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "claimed_at": { + "name": "claimed_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "retry_count": { + "name": "retry_count", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 0 + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "idx_schedules_next_run": { + "name": "idx_schedules_next_run", + "columns": ["active", "next_run_at"], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + } + }, + "views": {}, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + }, + "internal": { + "indexes": {} + } +} diff --git a/apps/operator/migrations/meta/_journal.json b/apps/operator/migrations/meta/_journal.json index 89ca5b6..d65369f 100644 --- a/apps/operator/migrations/meta/_journal.json +++ b/apps/operator/migrations/meta/_journal.json @@ -36,6 +36,13 @@ "when": 1777190805045, "tag": "0004_minor_rictor", "breakpoints": true + }, + { + "idx": 5, + "version": "6", + "when": 1777491867371, + "tag": "0005_nervous_scorpion", + "breakpoints": true } ] } diff --git a/apps/operator/package.json b/apps/operator/package.json index e099c45..3091a03 100644 --- a/apps/operator/package.json +++ b/apps/operator/package.json @@ -17,6 +17,7 @@ "@hono/zod-validator": "0.7.6", "@repo/http-client": "workspace:*", "@repo/logger": "workspace:*", + "@repo/url-validator": "workspace:*", "drizzle-orm": "0.45.2", "hono": "4.12.12", "node-html-markdown": "2.0.0", diff --git a/apps/operator/src/db/schema.ts b/apps/operator/src/db/schema.ts index 9b3eccf..8ba75ec 100644 --- a/apps/operator/src/db/schema.ts +++ b/apps/operator/src/db/schema.ts @@ -22,6 +22,9 @@ const schedules = sqliteTable( stateJson: text("state_json"), description: text("description").notNull(), active: int("active", { mode: "boolean" }).notNull().default(true), + useBrowser: int("use_browser", { mode: "boolean" }) + .notNull() + .default(false), nextRunAt: text("next_run_at").notNull(), claimedAt: text("claimed_at"), retryCount: int("retry_count").notNull().default(0), diff --git a/apps/operator/src/modules/telegram/controller.ts b/apps/operator/src/modules/telegram/controller.ts index 8111219..8110af7 100644 --- a/apps/operator/src/modules/telegram/controller.ts +++ b/apps/operator/src/modules/telegram/controller.ts @@ -23,7 +23,7 @@ import { TELEGRAM_HTML_SAFE_LENGTH, TELEGRAM_MAX_MESSAGE_LENGTH, } from "../../utils/message"; -import { validateSourceUrl } from "../../utils/url-validator"; +import { validateSourceUrl } from "@repo/url-validator"; const DAYS = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]; diff --git a/apps/operator/src/scheduled.ts b/apps/operator/src/scheduled.ts index 3aac59c..1ca3564 100644 --- a/apps/operator/src/scheduled.ts +++ b/apps/operator/src/scheduled.ts @@ -16,7 +16,7 @@ import { TELEGRAM_HTML_SAFE_LENGTH, TELEGRAM_MAX_MESSAGE_LENGTH, } from "./utils/message"; -import { validateSourceUrl } from "./utils/url-validator"; +import { validateSourceUrl } from "@repo/url-validator"; type Env = AppEnv["Bindings"]; @@ -84,7 +84,10 @@ const handleScheduled = async ( return; } - const scrapeResult = await scrapeUrl(schedule.sourceUrl); + const scrapeResult = await scrapeUrl(schedule.sourceUrl, { + browserScraper: env.BROWSER_SCRAPER, + useBrowser: schedule.useBrowser, + }); if (!scrapeResult.ok) { throw new Error(`Scrape failed: ${scrapeResult.error}`); } diff --git a/apps/operator/src/services/scrape.test.ts b/apps/operator/src/services/scrape.test.ts index b121c64..90cc155 100644 --- a/apps/operator/src/services/scrape.test.ts +++ b/apps/operator/src/services/scrape.test.ts @@ -148,7 +148,9 @@ describe("scrapeUrl", () => { vi.fn().mockResolvedValueOnce(createMockResponse(longHtml)) ); - const result = await scrapeUrl("https://example.com", 100); + const result = await scrapeUrl("https://example.com", { + maxTextLength: 100, + }); expect(result.ok).toBe(true); if (result.ok) { @@ -273,6 +275,134 @@ describe("scrapeUrl", () => { }); }); +describe("scrapeUrl via browserScraper", () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + const makeFetcher = (response: Response): Fetcher => + ({ fetch: vi.fn().mockResolvedValue(response) }) as unknown as Fetcher; + + it("returns markdown from a successful browser-scraper response", async () => { + const browserScraper = makeFetcher( + Response.json({ + ok: true, + html: "

Hello SPA

", + finalUrl: "https://example.com/", + status: 200, + contentType: "text/html", + truncated: false, + }) + ); + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: true, + }); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text).toContain("Hello SPA"); + } + }); + + it("propagates truncated=true from browser-scraper even when text fits within maxTextLength", async () => { + const browserScraper = makeFetcher( + Response.json({ + ok: true, + html: "

short content

", + finalUrl: "https://example.com/", + status: 200, + contentType: "text/html", + truncated: true, + }) + ); + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: true, + }); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.truncated).toBe(true); + } + }); + + it("propagates error when browser-scraper returns ok:false without status", async () => { + const browserScraper = makeFetcher( + Response.json({ ok: false, error: "Navigation timeout" }) + ); + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: true, + }); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Navigation timeout"); + } + }); + + it("maps browser-scraper status through ERROR_MAP", async () => { + const browserScraper = makeFetcher( + Response.json({ ok: false, error: "HTTP 403", status: 403 }) + ); + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: true, + }); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Blocked by site"); + expect(result.statusCode).toBe(403); + } + }); + + it("returns error when browser-scraper returns invalid JSON", async () => { + const browserScraper = makeFetcher( + new Response("not-json", { + status: 500, + headers: { "content-type": "text/plain" }, + }) + ); + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: true, + }); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toMatch(/invalid JSON/); + } + }); + + it("uses native fetch when useBrowser is false even if browserScraper is provided", async () => { + const fetchMock = vi + .fn() + .mockResolvedValueOnce(createMockResponse("

fetched

")); + vi.stubGlobal("fetch", fetchMock); + + const browserScraperFetch = vi.fn(); + const browserScraper = { + fetch: browserScraperFetch, + } as unknown as Fetcher; + + const result = await scrapeUrl("https://example.com", { + browserScraper, + useBrowser: false, + }); + + expect(result.ok).toBe(true); + expect(fetchMock).toHaveBeenCalledOnce(); + expect(browserScraperFetch).not.toHaveBeenCalled(); + }); +}); + describe("convertContent", () => { it("converts HTML to markdown", () => { const result = convertContent( diff --git a/apps/operator/src/services/scrape.ts b/apps/operator/src/services/scrape.ts index b033c42..5455678 100644 --- a/apps/operator/src/services/scrape.ts +++ b/apps/operator/src/services/scrape.ts @@ -1,11 +1,31 @@ +import { validateSourceUrl } from "@repo/url-validator"; import { NodeHtmlMarkdown } from "node-html-markdown"; -import { validateSourceUrl } from "../utils/url-validator"; - type ScrapeOk = { ok: true; text: string; truncated: boolean }; type ScrapeError = { ok: false; error: string; statusCode?: number }; type ScrapeResult = ScrapeOk | ScrapeError; +type ScrapeOptions = { + maxTextLength?: number; + browserScraper?: Fetcher; + useBrowser?: boolean; +}; + +type FetchedContent = + | { ok: true; raw: string; contentType: string; truncated?: boolean } + | { ok: false; error: string; statusCode?: number }; + +type BrowserScraperResponse = + | { + ok: true; + html: string; + finalUrl: string; + status: number; + contentType: string; + truncated: boolean; + } + | { ok: false; error: string; status?: number }; + const MAX_BODY_BYTES = 2 * 1024 * 1024; // 2 MB const DEFAULT_MAX_TEXT_LENGTH = 80_000; const FETCH_TIMEOUT_MS = 15_000; @@ -139,10 +159,7 @@ const fetchWithSafeRedirects = async ( throw new Error("Too many redirects"); }; -const scrapeUrl = async ( - url: string, - maxTextLength = DEFAULT_MAX_TEXT_LENGTH -): Promise => { +const fetchViaNative = async (url: string): Promise => { let response: Response; try { response = await fetchWithSafeRedirects(url); @@ -179,17 +196,84 @@ const scrapeUrl = async ( const raw = new TextDecoder().decode(bodyResult.bytes); const contentType = response.headers.get("content-type") ?? ""; - const converted = convertContent(raw, contentType); + return { ok: true, raw, contentType }; +}; + +const fetchViaBrowserScraper = async ( + browserScraper: Fetcher, + url: string +): Promise => { + let response: Response; + try { + response = await browserScraper.fetch("https://browser-scraper/", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ url }), + }); + } catch (error) { + return { + ok: false, + error: + error instanceof Error ? error.message : "Browser scraper unreachable", + }; + } + + let body: BrowserScraperResponse; + try { + body = await response.json(); + } catch { + return { + ok: false, + error: `Browser scraper returned invalid JSON (HTTP ${String(response.status)})`, + }; + } + + if (!body.ok) { + const status = body.status; + const mapped = status !== undefined ? ERROR_MAP[status] : undefined; + return status !== undefined + ? { ok: false, error: mapped ?? body.error, statusCode: status } + : { ok: false, error: body.error }; + } + + return { + ok: true, + raw: body.html, + contentType: body.contentType, + truncated: body.truncated, + }; +}; + +const scrapeUrl = async ( + url: string, + options: ScrapeOptions = {} +): Promise => { + const maxTextLength = options.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH; + + const fetched = + options.useBrowser === true && options.browserScraper + ? await fetchViaBrowserScraper(options.browserScraper, url) + : await fetchViaNative(url); + + if (!fetched.ok) { + return fetched; + } + + const converted = convertContent(fetched.raw, fetched.contentType); if ("error" in converted) { return { ok: false, error: converted.error }; } - const truncated = converted.text.length > maxTextLength; - const text = truncated + const textTruncated = converted.text.length > maxTextLength; + const text = textTruncated ? converted.text.slice(0, maxTextLength) : converted.text; - return { ok: true, text, truncated }; + return { + ok: true, + text, + truncated: textTruncated || fetched.truncated === true, + }; }; export { @@ -202,4 +286,4 @@ export { scrapeUrl, USER_AGENT, }; -export type { ScrapeResult }; +export type { ScrapeOptions, ScrapeResult }; diff --git a/apps/operator/src/types/env.ts b/apps/operator/src/types/env.ts index 93b0b84..a90ba13 100644 --- a/apps/operator/src/types/env.ts +++ b/apps/operator/src/types/env.ts @@ -11,6 +11,7 @@ const envSchema = z.object({ type AppEnv = { Bindings: z.infer & { DB: D1Database; + BROWSER_SCRAPER: Fetcher; }; Variables: { logger: Logger; diff --git a/apps/operator/wrangler.jsonc b/apps/operator/wrangler.jsonc index 931f8fe..a830e9c 100644 --- a/apps/operator/wrangler.jsonc +++ b/apps/operator/wrangler.jsonc @@ -18,6 +18,12 @@ "migrations_dir": "migrations", }, ], + "services": [ + { + "binding": "BROWSER_SCRAPER", + "service": "switch-operator-browser-scraper", + }, + ], "triggers": { "crons": ["* * * * *"], }, diff --git a/packages/logger/src/__tests__/logger.test.ts b/packages/logger/src/__tests__/logger.test.ts index 88d9c87..c89a0e9 100644 --- a/packages/logger/src/__tests__/logger.test.ts +++ b/packages/logger/src/__tests__/logger.test.ts @@ -61,6 +61,32 @@ describe("Logger", () => { expect(entry.statusCode).toBe(200); }); + it("redacts sensitive top-level metadata keys (case-insensitive)", () => { + const logger = new Logger({ context: "api" }); + logger.info("call", { + authorization: "Bearer secret-token", + Authorization: "Bearer also-secret", + api_key: "sk-abc123", + APIKey: "sk-def456", + token: "tok-xyz", + secret: "shh", + password: "hunter2", + cookie: "session=abc", + method: "GET", + }); + + const entry = parseLogEntry(consoleSpy.log) as Record; + expect(entry.authorization).toBe("[redacted]"); + expect(entry.Authorization).toBe("[redacted]"); + expect(entry.api_key).toBe("[redacted]"); + expect(entry.APIKey).toBe("[redacted]"); + expect(entry.token).toBe("[redacted]"); + expect(entry.secret).toBe("[redacted]"); + expect(entry.password).toBe("[redacted]"); + expect(entry.cookie).toBe("[redacted]"); + expect(entry.method).toBe("GET"); + }); + it("does not allow metadata to override reserved fields", () => { const logger = new Logger({ context: "api" }); logger.warn("actual", { diff --git a/packages/logger/src/logger.ts b/packages/logger/src/logger.ts index f4ea1b2..a827047 100644 --- a/packages/logger/src/logger.ts +++ b/packages/logger/src/logger.ts @@ -16,6 +16,26 @@ const CONSOLE_METHOD: Record = { error: "error", }; +const REDACTED_KEYS = new Set([ + "authorization", + "api_key", + "apikey", + "token", + "secret", + "password", + "cookie", +]); + +const REDACTED = "[redacted]"; + +const redactMetadata = (metadata: LogMetadata): LogMetadata => { + const out: LogMetadata = {}; + for (const [key, value] of Object.entries(metadata)) { + out[key] = REDACTED_KEYS.has(key.toLowerCase()) ? REDACTED : value; + } + return out; +}; + export class Logger { private readonly context: string; private readonly minLevel: LogLevel; @@ -55,7 +75,7 @@ export class Logger { } const entry: LogEntry = { - ...(metadata ?? {}), + ...(metadata ? redactMetadata(metadata) : {}), timestamp: new Date().toISOString(), level, context: this.context, diff --git a/packages/url-validator/eslint.config.ts b/packages/url-validator/eslint.config.ts new file mode 100644 index 0000000..f57754c --- /dev/null +++ b/packages/url-validator/eslint.config.ts @@ -0,0 +1,5 @@ +/* eslint-disable import/no-default-export */ +import { baseConfig } from "@repo/eslint/base"; +import { defineConfig } from "eslint/config"; + +export default defineConfig(baseConfig); diff --git a/packages/url-validator/package.json b/packages/url-validator/package.json new file mode 100644 index 0000000..a0ac3cc --- /dev/null +++ b/packages/url-validator/package.json @@ -0,0 +1,25 @@ +{ + "name": "@repo/url-validator", + "version": "1.0.0", + "private": true, + "type": "module", + "exports": { + ".": "./src/url-validator.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "lint": "eslint .", + "test": "vitest run --passWithNoTests", + "format": "prettier --check . --ignore-path ../../.gitignore" + }, + "devDependencies": { + "@repo/eslint": "workspace:*", + "@repo/prettier": "workspace:*", + "@repo/typescript": "workspace:*", + "eslint": "9.39.1", + "prettier": "3.8.2", + "typescript": "6.0.2", + "vitest": "4.1.4" + }, + "prettier": "@repo/prettier" +} diff --git a/apps/operator/src/utils/url-validator.test.ts b/packages/url-validator/src/url-validator.test.ts similarity index 100% rename from apps/operator/src/utils/url-validator.test.ts rename to packages/url-validator/src/url-validator.test.ts diff --git a/apps/operator/src/utils/url-validator.ts b/packages/url-validator/src/url-validator.ts similarity index 100% rename from apps/operator/src/utils/url-validator.ts rename to packages/url-validator/src/url-validator.ts diff --git a/packages/url-validator/tsconfig.json b/packages/url-validator/tsconfig.json new file mode 100644 index 0000000..7b05454 --- /dev/null +++ b/packages/url-validator/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "@repo/typescript/base.json", + "compilerOptions": { + "types": ["vitest/globals"] + }, + "include": ["src", "*.ts"], + "exclude": ["node_modules"] +} diff --git a/packages/url-validator/vitest.config.ts b/packages/url-validator/vitest.config.ts new file mode 100644 index 0000000..ab8f85c --- /dev/null +++ b/packages/url-validator/vitest.config.ts @@ -0,0 +1,8 @@ +/* eslint-disable import/no-default-export */ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + globals: true, + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2c462b3..b63f3bf 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -21,6 +21,52 @@ importers: specifier: 0.7.2 version: 0.7.2(@ianvs/prettier-plugin-sort-imports@4.7.1(prettier@3.8.2))(prettier@3.8.2) + apps/browser-scraper: + dependencies: + '@cloudflare/playwright': + specifier: 1.2.0 + version: 1.2.0 + '@repo/logger': + specifier: workspace:* + version: link:../../packages/logger + '@repo/url-validator': + specifier: workspace:* + version: link:../../packages/url-validator + zod: + specifier: 4.3.6 + version: 4.3.6 + devDependencies: + '@cloudflare/workers-types': + specifier: 4.20260412.1 + version: 4.20260412.1 + '@repo/eslint': + specifier: workspace:* + version: link:../../tooling/eslint + '@repo/prettier': + specifier: workspace:* + version: link:../../tooling/prettier + '@repo/typescript': + specifier: workspace:* + version: link:../../tooling/typescript + '@types/node': + specifier: 25.6.0 + version: 25.6.0 + eslint: + specifier: 9.39.1 + version: 9.39.1(jiti@2.6.1) + prettier: + specifier: 3.8.2 + version: 3.8.2 + typescript: + specifier: 6.0.2 + version: 6.0.2 + vitest: + specifier: 4.1.4 + version: 4.1.4(@types/node@25.6.0)(vite@8.0.3(@emnapi/core@1.9.1)(@emnapi/runtime@1.9.2)(@types/node@25.6.0)(esbuild@0.27.3)(jiti@2.6.1)(tsx@4.21.0)) + wrangler: + specifier: 4.81.1 + version: 4.81.1(@cloudflare/workers-types@4.20260412.1) + apps/operator: dependencies: '@hono/zod-validator': @@ -32,6 +78,9 @@ importers: '@repo/logger': specifier: workspace:* version: link:../../packages/logger + '@repo/url-validator': + specifier: workspace:* + version: link:../../packages/url-validator drizzle-orm: specifier: 0.45.2 version: 0.45.2(@cloudflare/workers-types@4.20260412.1) @@ -140,6 +189,30 @@ importers: specifier: 4.1.4 version: 4.1.4(@types/node@25.6.0)(vite@8.0.3(@emnapi/core@1.9.1)(@emnapi/runtime@1.9.2)(@types/node@25.6.0)(esbuild@0.27.3)(jiti@2.6.1)(tsx@4.21.0)) + packages/url-validator: + devDependencies: + '@repo/eslint': + specifier: workspace:* + version: link:../../tooling/eslint + '@repo/prettier': + specifier: workspace:* + version: link:../../tooling/prettier + '@repo/typescript': + specifier: workspace:* + version: link:../../tooling/typescript + eslint: + specifier: 9.39.1 + version: 9.39.1(jiti@2.6.1) + prettier: + specifier: 3.8.2 + version: 3.8.2 + typescript: + specifier: 6.0.2 + version: 6.0.2 + vitest: + specifier: 4.1.4 + version: 4.1.4(@types/node@25.6.0)(vite@8.0.3(@emnapi/core@1.9.1)(@emnapi/runtime@1.9.2)(@types/node@25.6.0)(esbuild@0.27.3)(jiti@2.6.1)(tsx@4.21.0)) + tooling/eslint: dependencies: '@eslint/compat': @@ -279,6 +352,9 @@ packages: resolution: {integrity: sha512-SIOD2DxrRRwQ+jgzlXCqoEFiKOFqaPjhnNTGKXSRLvp1HiOvapLaFG2kEr9dYQTYe8rKrd9uvDUzmAITeNyaHQ==} engines: {node: '>=18.0.0'} + '@cloudflare/playwright@1.2.0': + resolution: {integrity: sha512-+LQo135uWxf0M2nG4T05T0FNH9Qhif7Uq2AhT79hX/FuoTLORX+OmOshl+cObTderrRQ4wkq1AXrUBq3w2eubg==} + '@cloudflare/unenv-preset@2.16.0': resolution: {integrity: sha512-8ovsRpwzPoEqPUzoErAYVv8l3FMZNeBVQfJTvtzP4AgLSRGZISRfuChFxHWUQd3n6cnrwkuTGxT+2cGo8EsyYg==} peerDependencies: @@ -2896,6 +2972,8 @@ snapshots: '@cloudflare/kv-asset-handler@0.4.2': {} + '@cloudflare/playwright@1.2.0': {} + '@cloudflare/unenv-preset@2.16.0(unenv@2.0.0-rc.24)(workerd@1.20260409.1)': dependencies: unenv: 2.0.0-rc.24