# High‑End Web Crawler/Scraper + UI (TypeScript)

Target stack: Cloudflare Workers (edge) + Hono, Puppeteer via Workers Browser Rendering, Queues + Durable Objects, React 19 + Vite + shadcn/ui.

> ⚖️ Note: Directly scraping Google Search pages may violate Google’s ToS. This starter ships two providers: a compliant default (Google Programmable Search JSON API) and an experimental SERP scraper behind a feature flag. Use the API provider in production.

---

## Monorepo layout

```
apps/
  worker/                      # Cloudflare Worker API, crawlers, queues
    src/
      index.ts                 # Hono app, routes
      providers/
        googleProgrammableSearch.ts
        googleSerpScraper.ts   # experimental (feature-flagged)
      crawl/
        crawlUrls.ts
        extractors/
          schemaOrg.ts
          generic.ts
      rate-limit/
        limiter.ts             # Durable Objects rate limiter
    wrangler.toml
    package.json
    tsconfig.json
  web/                         # React 19 + Vite + shadcn/ui
    src/
      main.tsx
      App.tsx
      components/
        SearchForm.tsx
        ResultsTable.tsx
        JobSidebar.tsx
      lib/
        api.ts
        useJob.ts
    index.html
    package.json
    tsconfig.json
packages/
  shared/
    src/types.ts
    package.json
```

---

## Cloudflare Worker — `apps/worker/src/index.ts`

```ts
// apps/worker/src/index.ts
import { Hono } from 'hono'
import { cors } from 'hono/cors'
import { prettyJSON } from 'hono/pretty-json'
import { googleProgrammableSearch } from './providers/googleProgrammableSearch'
import { googleSerpScraper } from './providers/googleSerpScraper' // experimental
import { enqueueCrawl } from './crawl/crawlUrls'
import type { Env, SearchRequest, SearchResponse } from 'shared/types'

// The Durable Object class must be exported from the Worker entry module
// so the RATE_LIMITER binding in wrangler.toml can resolve it.
export { Limiter } from './rate-limit/limiter'

const app = new Hono<{ Bindings: Env }>()

app.use('*', cors())
app.use('*', prettyJSON())

app.get('/health', (c) => c.text('ok'))

// Kick off a search job (Google provider + optional follow-up crawling)
app.post('/api/search', async (c) => {
  const body = (await c.req.json()) as SearchRequest
  const provider = body.provider ?? 'programmable'
  const query = body.query?.trim()
  if (!query) return c.json({ error: 'Missing query' }, 400)

  let results: SearchResponse
  if (provider === 'programmable') {
    results = await googleProgrammableSearch(c.env, { query, page: body.page ?? 1, region: body.region })
  } else {
    // experimental: requires `BROWSER` binding in wrangler.toml
    results = await googleSerpScraper(c.env, { query, page: body.page ?? 1, region: body.region })
  }

  // Optionally queue deep-crawling of result URLs for structured data enrichment
  if (body.deepCrawl) {
    await enqueueCrawl(c.env, results.items.map((i) => i.url))
  }

  return c.json(results)
})

// Worker entrypoint: HTTP fetch handler + crawl queue consumer
export default {
  async fetch(request: Request, env: Env, ctx: ExecutionContext) {
    return app.fetch(request, env, ctx)
  },
  async queue(batch: MessageBatch<string>, env: Env, ctx: ExecutionContext) {
    for (const msg of batch.messages) {
      try {
        const { crawlAndExtract } = await import('./crawl/crawlUrls')
        const enriched = await crawlAndExtract(env, msg.body)
        // TODO: persist enriched data (KV/D1/R2) or forward to webhook
        console.log('Enriched', enriched.url)
        msg.ack()
      } catch (e) {
        console.error('Crawl failed', e)
        msg.retry()
      }
    }
  },
}
```
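
The queue consumer leaves persistence as a TODO. A minimal sketch of the KV option, assuming a hypothetical `RESULTS_KV` KV namespace binding that is not part of this starter (you would add it to `Env` and `wrangler.toml`):

```ts
// Hypothetical persistence step for the queue consumer above.
// Assumes a RESULTS_KV KV namespace binding that this starter does NOT declare.
import type { EnrichedPage } from 'shared/types'

export async function persistEnrichedPage(kv: KVNamespace, page: EnrichedPage) {
  // Key by URL so re-crawls overwrite stale data; keep entries for 30 days.
  await kv.put(`page:${page.url}`, JSON.stringify(page), { expirationTtl: 60 * 60 * 24 * 30 })
}

// Inside the queue handler, replace the console.log with:
//   await persistEnrichedPage(env.RESULTS_KV, enriched)
```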

---

## Google Provider (compliant default) — `googleProgrammableSearch.ts`

```ts
// apps/worker/src/providers/googleProgrammableSearch.ts
import type { Env, SearchResponse } from 'shared/types'

const ENDPOINT = 'https://www.googleapis.com/customsearch/v1'

export async function googleProgrammableSearch(env: Env, {
  query,
  page = 1,
  region,
}: { query: string; page?: number; region?: string }): Promise<SearchResponse> {
  const start = (page - 1) * 10 + 1
  const u = new URL(ENDPOINT)
  u.searchParams.set('key', env.GOOGLE_API_KEY)
  u.searchParams.set('cx', env.GOOGLE_CSE_ID)
  u.searchParams.set('q', query)
  u.searchParams.set('start', String(start))
  if (region) u.searchParams.set('gl', region) // country code; optional

  const res = await fetch(u.toString())
  if (!res.ok) throw new Error(`Google API error: ${res.status}`)
  const json: any = await res.json()

  const items = (json.items ?? []).map((it: any) => ({
    title: it.title as string,
    url: it.link as string,
    snippet: it.snippet as string,
    source: 'google:cse' as const,
  }))

  return {
    provider: 'programmable',
    query,
    page,
    items,
    total: json.searchInformation?.totalResults ? Number(json.searchInformation.totalResults) : items.length,
  }
}
```
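
The JSON API returns at most 10 results per request, so larger result sets mean paging with `start`. A minimal sketch of a paging loop; the `collectPages` helper is hypothetical and not part of the starter:

```ts
// Hypothetical helper: gather up to `pages` pages of CSE results into one list.
// Each page is a separate API call (and counts against your quota).
import type { Env, SearchItem } from 'shared/types'
import { googleProgrammableSearch } from './googleProgrammableSearch'

export async function collectPages(env: Env, query: string, pages = 3, region?: string): Promise<SearchItem[]> {
  const all: SearchItem[] = []
  for (let page = 1; page <= pages; page++) {
    const res = await googleProgrammableSearch(env, { query, page, region })
    all.push(...res.items)
    if (res.items.length < 10) break // ran out of results early
  }
  return all
}
```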

---

## Experimental SERP Scraper (feature‑flag) — `googleSerpScraper.ts`

> Requires **Workers Browser Rendering** (`BROWSER` binding) and must abide by your own compliance policy. Backoff + rate‑limiting via Durable Objects is strongly recommended.

```ts
// apps/worker/src/providers/googleSerpScraper.ts
import type { Env, SearchResponse } from 'shared/types'

export async function googleSerpScraper(env: Env, {
  query, page = 1, region,
}: { query: string; page?: number; region?: string }): Promise<SearchResponse> {
  // Lazy-load so the dependency is only pulled in when the flag is used.
  const { default: puppeteer } = await import('@cloudflare/puppeteer')

  // region code influences Google locale via hl/gl query params
  const q = new URL('https://www.google.com/search')
  q.searchParams.set('q', query)
  if (region) q.searchParams.set('gl', region)
  if (page > 1) q.searchParams.set('start', String((page - 1) * 10))

  const browser = await puppeteer.launch(env.BROWSER)
  const pageObj = await browser.newPage()
  await pageObj.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36')
  await pageObj.goto(q.toString(), { waitUntil: 'networkidle0' })

  const items = await pageObj.$$eval('a[href^="http"] h3', (hs) => {
    return hs.map((h) => {
      const a = h.closest('a') as HTMLAnchorElement | null
      const title = (h as HTMLElement).innerText
      const url = a?.href || ''
      const snippetEl = a?.parentElement?.parentElement?.querySelector('div[data-sncf]') as HTMLElement | null
      const snippet = snippetEl?.innerText || ''
      return { title, url, snippet, source: 'google:html' as const }
    })
  })

  await browser.close()

  // Filter obvious Google redirectors or tracking params
  const cleaned = items.filter((i) => i.url && !i.url.includes('/search?') && !i.url.includes('googleusercontent.com'))
  return { provider: 'serp-scraper', query, page, items: cleaned, total: cleaned.length }
}
```
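
The hardening checklist below calls for retries and a circuit breaker around this provider. A minimal sketch of the retry half, with exponential backoff and jitter; the `withBackoff` helper is hypothetical and not part of the starter:

```ts
// Hypothetical wrapper: retry a flaky operation with exponential backoff + jitter.
// A full circuit breaker would also track consecutive failures and skip the
// provider entirely for a cool-down period.
export async function withBackoff<T>(fn: () => Promise<T>, attempts = 3, baseMs = 500): Promise<T> {
  let lastError: unknown
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn()
    } catch (e) {
      lastError = e
      const delay = baseMs * 2 ** i + Math.random() * baseMs // jitter
      await new Promise((r) => setTimeout(r, delay))
    }
  }
  throw lastError
}

// Usage (inside the /api/search handler):
//   results = await withBackoff(() => googleSerpScraper(c.env, { query, page: body.page ?? 1, region: body.region }))
```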

---

## Crawl & Extract — `crawl/crawlUrls.ts`

```ts
// apps/worker/src/crawl/crawlUrls.ts
// HTMLRewriter is a Workers runtime global, so no import is needed.
import type { Env, EnrichedPage } from 'shared/types'
import { parseSchemaOrgJsonLd } from './extractors/schemaOrg'

export async function enqueueCrawl(env: Env, urls: string[]) {
  for (const u of urls) await env.CRAWL_QUEUE.send(u)
}

export async function crawlAndExtract(env: Env, url: string): Promise<EnrichedPage> {
  const res = await fetch(url, { headers: { 'User-Agent': 'LeverageAI-Crawler/1.0 (+https://example.com/bot)' } })
  const contentType = res.headers.get('content-type') || ''
  const html = await res.text()

  // Extract with JSON-LD first (LocalBusiness, Organization, etc.)
  const structured = parseSchemaOrgJsonLd(html)

  // Lightweight extraction with HTMLRewriter for title/phone/email fallbacks
  let title = ''
  let email = ''
  const rewriter = new HTMLRewriter()
    .on('title', { text(t) { title += t.text } })
    .on('a[href^="mailto:"]', { element(el) { const m = el.getAttribute('href')?.replace('mailto:', '') || ''; if (!email) email = m } })
  // Drain the transformed stream so the handlers above actually run.
  await rewriter.transform(new Response(html, { headers: { 'content-type': contentType } })).text()

  return { url, title: title.trim(), email, structured }
}
```
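
The hardening checklist asks the crawler to respect `robots.txt`. A minimal pre-flight sketch; the `isAllowedByRobots` helper is hypothetical, deliberately naive, and only handles the simplest `User-agent: *` / `Disallow:` rules:

```ts
// Hypothetical, naive robots.txt check for the generic '*' agent.
// A production crawler should use a full parser and cache robots.txt per host.
export async function isAllowedByRobots(url: string): Promise<boolean> {
  const target = new URL(url)
  const res = await fetch(`${target.origin}/robots.txt`)
  if (!res.ok) return true // no robots.txt (or unreadable): assume allowed

  const lines = (await res.text()).split('\n').map((l) => l.trim())
  let appliesToUs = false
  for (const line of lines) {
    const [field, ...rest] = line.split(':')
    const value = rest.join(':').trim()
    if (/^user-agent$/i.test(field)) appliesToUs = value === '*'
    if (appliesToUs && /^disallow$/i.test(field) && value && target.pathname.startsWith(value)) return false
  }
  return true
}

// In crawlAndExtract: if (!(await isAllowedByRobots(url))) return { url, structured: [] }
```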

---

## Schema.org JSON‑LD Parser — `extractors/schemaOrg.ts`

```ts
// apps/worker/src/crawl/extractors/schemaOrg.ts
import type { StructuredData } from 'shared/types'

export function parseSchemaOrgJsonLd(html: string): StructuredData[] {
  const scripts = [...html.matchAll(/<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi)]
  const out: StructuredData[] = []
  for (const [, json] of scripts) {
    try {
      const data = JSON.parse(json)
      const arr = Array.isArray(data) ? data : [data]
      for (const d of arr) {
        out.push({
          '@type': d['@type'],
          name: d.name,
          telephone: d.telephone,
          url: d.url,
          address: d.address,
          sameAs: d.sameAs,
        })
      }
    } catch {
      // Ignore malformed JSON-LD blocks rather than failing the whole page.
    }
  }
  return out
}
```
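
A quick usage example against a typical `LocalBusiness` block; the HTML snippet is illustrative only:

```ts
import { parseSchemaOrgJsonLd } from './extractors/schemaOrg' // adjust the relative path to your caller

// Illustrative input: a page embedding one LocalBusiness JSON-LD block.
const html = `
  <script type="application/ld+json">
    { "@type": "LocalBusiness", "name": "Acme Electric", "telephone": "+1-541-555-0100", "url": "https://acme.example" }
  </script>`

console.log(parseSchemaOrgJsonLd(html))
// → [{ '@type': 'LocalBusiness', name: 'Acme Electric', telephone: '+1-541-555-0100', url: 'https://acme.example', ... }]
```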

---

## Durable Objects Rate Limiter — `rate-limit/limiter.ts`

```ts
// apps/worker/src/rate-limit/limiter.ts
export class Limiter {
  state: DurableObjectState
  constructor(state: DurableObjectState) { this.state = state }

  async fetch(req: Request) {
    const { key, limit = 30, window = 60 } = (await req.json()) as { key: string; limit?: number; window?: number }
    const now = Date.now()

    // Fixed-window counter per key, persisted in Durable Object storage.
    const bucket = (await this.state.storage.get<{ count: number; reset: number }>(key)) || { count: 0, reset: now + window * 1000 }
    if (now > bucket.reset) { bucket.count = 0; bucket.reset = now + window * 1000 }
    if (++bucket.count > limit) return new Response('rate_limited', { status: 429 })

    await this.state.storage.put(key, bucket)
    return new Response(JSON.stringify({ remaining: limit - bucket.count, reset: bucket.reset }), { headers: { 'content-type': 'application/json' } })
  }
}
```
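
To call the limiter from the Worker, grab a stub by name and POST the key and limits. A minimal sketch; the `assertWithinLimit` helper is hypothetical and assumes the `RATE_LIMITER: DurableObjectNamespace` binding declared in `wrangler.toml` and typed in the shared `Env`:

```ts
// Hypothetical gate used before provider calls.
import type { Env } from 'shared/types'

export async function assertWithinLimit(env: Env, key: string) {
  // One DO instance per key keeps counters isolated (e.g. per provider or per host).
  const id = env.RATE_LIMITER.idFromName(key)
  const stub = env.RATE_LIMITER.get(id)
  const res = await stub.fetch('https://limiter/', {
    method: 'POST',
    body: JSON.stringify({ key, limit: 30, window: 60 }),
  })
  if (res.status === 429) throw new Error(`rate limited: ${key}`)
}

// Usage in the SERP provider: await assertWithinLimit(env, 'google-serp')
```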

---

## Worker bindings — `apps/worker/wrangler.toml`

```toml
name = "leverageai-crawler"
main = "src/index.ts"
compatibility_date = "2025-09-15"
[vars]
GOOGLE_CSE_ID = "replaceme"
GOOGLE_API_KEY = "replaceme"
[[durable_objects.bindings]]
name = "RATE_LIMITER"
class_name = "Limiter"
[[queues.producers]]
queue = "CRAWL_QUEUE"
binding = "CRAWL_QUEUE"
[[queues.consumers]]
queue = "CRAWL_QUEUE"
[browser]
binding = "BROWSER" # required for experimental SERP scraper
```

---

## Shared types — `packages/shared/src/types.ts`

```ts
// packages/shared/src/types.ts
export type Env = {
  GOOGLE_API_KEY: string
  GOOGLE_CSE_ID: string
  CRAWL_QUEUE: Queue<string>
  RATE_LIMITER: DurableObjectNamespace
  BROWSER: any // Workers Browser Rendering binding
}

export type SearchRequest = {
  query: string
  page?: number
  region?: string
  provider?: 'programmable' | 'serp-scraper'
  deepCrawl?: boolean
}

export type SearchItem = { title: string; url: string; snippet?: string; source: 'google:cse' | 'google:html' }

export type SearchResponse = { provider: string; query: string; page: number; items: SearchItem[]; total: number }

export type StructuredData = { '@type'?: string; name?: string; telephone?: string; url?: string; address?: unknown; sameAs?: string[] }

export type EnrichedPage = { url: string; title?: string; email?: string; structured?: StructuredData[] }
```

---

## React 19 + shadcn UI — `apps/web/src/App.tsx`

```tsx
// apps/web/src/App.tsx
import { useState } from 'react'
import { SearchForm } from './components/SearchForm'
import { ResultsTable } from './components/ResultsTable'
import { JobSidebar } from './components/JobSidebar'

export default function App() {
  const [rows, setRows] = useState<any[]>([])
  return (
    <div className="min-h-svh grid grid-cols-12 gap-6 p-6 bg-background text-foreground">
      <div className="col-span-8 space-y-6">
        <SearchForm onResults={setRows} />
        <ResultsTable rows={rows} />
      </div>
      <div className="col-span-4">
        <JobSidebar />
      </div>
    </div>
  )
}
```

---

### `SearchForm.tsx`

```tsx
import { useState, useTransition } from 'react'
import { Input } from '@/components/ui/input'
import { Button } from '@/components/ui/button'
import { Label } from '@/components/ui/label'
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
import { api } from '../lib/api'

export function SearchForm({ onResults }: { onResults: (rows: any[]) => void }) {
  const [query, setQuery] = useState('Best Electricians in Southern Oregon')
  const [deepCrawl, setDeepCrawl] = useState(true)
  const [isPending, startTransition] = useTransition()

  return (
    <Card>
      <CardHeader><CardTitle>Google Search + Crawl</CardTitle></CardHeader>
      <CardContent className="space-y-3">
        <Label htmlFor="q">Query</Label>
        <Input id="q" value={query} onChange={(e) => setQuery(e.target.value)} />
        <div className="flex items-center gap-2">
          <input id="deep" type="checkbox" checked={deepCrawl} onChange={(e) => setDeepCrawl(e.target.checked)} />
          <Label htmlFor="deep">Deep crawl result sites (enrich with schema.org)</Label>
        </div>
        <div className="flex gap-2">
          <Button disabled={isPending} onClick={() => startTransition(async () => {
            const res = await api.search({ query, deepCrawl, provider: 'programmable' })
            onResults(res.items)
          })}>Search</Button>
          <Button variant="secondary" disabled={isPending} onClick={() => startTransition(async () => {
            const res = await api.search({ query, deepCrawl, provider: 'serp-scraper' })
            onResults(res.items)
          })}>Try Experimental Scraper</Button>
        </div>
      </CardContent>
    </Card>
  )
}
```

---

### `ResultsTable.tsx`

```tsx
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'

export function ResultsTable({ rows }: { rows: any[] }) {
  return (
    <Card>
      <CardHeader><CardTitle>Results ({rows.length})</CardTitle></CardHeader>
      <CardContent>
        <div className="grid gap-3">
          {rows.map((r, i) => (
            <a key={i} href={r.url} target="_blank" rel="noreferrer" className="p-3 rounded-xl border hover:bg-muted">
              <div className="font-medium">{r.title}</div>
              <div className="text-sm text-muted-foreground break-all">{r.url}</div>
              {r.snippet && <div className="text-sm mt-1">{r.snippet}</div>}
            </a>
          ))}
        </div>
      </CardContent>
    </Card>
  )
}
```

---

### `JobSidebar.tsx`

```tsx
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'

export function JobSidebar() {
  return (
    <Card>
      <CardHeader><CardTitle>Job Status</CardTitle></CardHeader>
      <CardContent className="text-sm text-muted-foreground">
        Deep crawl jobs are queued and processed at the edge. Refresh to see enriched data in your store.
      </CardContent>
    </Card>
  )
}
```

---

### Simple API helper — `apps/web/src/lib/api.ts`

```ts
export const api = {
  async search(payload: any) {
    // VITE_API_BASE should include a trailing slash, e.g. "http://localhost:8787/".
    const base = import.meta.env.VITE_API_BASE || '/'
    const r = await fetch(base + 'api/search', { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify(payload) })
    if (!r.ok) throw new Error('API error')
    return r.json()
  },
}
```

---

## Vite + shadcn setup (web)
```sh
npm i -D vite typescript tailwindcss postcss autoprefixer
npm i react react-dom
```

Install shadcn/ui and components (Button, Card, Input, Label, Table):

```sh
npx shadcn@latest init
npx shadcn@latest add button card input label table
```

Add Tailwind and the `@/components` alias per the shadcn docs; a sketch of the alias wiring follows below.
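
A minimal sketch of `apps/web/vite.config.ts`, assuming the usual shadcn convention of aliasing `@` to `./src` and proxying `/api` to the local Worker (wrangler dev defaults to port 8787); adjust to your setup:

```ts
// apps/web/vite.config.ts (illustrative; pair with "paths": { "@/*": ["./src/*"] } in tsconfig)
import path from 'node:path'
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

export default defineConfig({
  plugins: [react()],
  resolve: {
    alias: { '@': path.resolve(__dirname, './src') },
  },
  server: {
    // Forward API calls to `wrangler dev` so VITE_API_BASE can stay unset locally.
    proxy: { '/api': 'http://localhost:8787' },
  },
})
```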

---

## Running locally
```sh
# Root
pnpm i

# Worker
cd apps/worker && pnpm dev   # wrangler dev

# Web
cd ../web && pnpm dev
```

Deploy the Worker:

```sh
cd apps/worker
pnpm run deploy   # wrangler deploy
```

---

## Usage example
POST `/api/search` with body:

```json
{ "query": "Best Electricians in Southern Oregon", "provider": "programmable", "deepCrawl": true }
```

Response (shape):

```json
{
  "provider": "programmable",
  "query": "Best Electricians in Southern Oregon",
  "page": 1,
  "total": 10,
  "items": [{ "title": "...", "url": "https://...", "snippet": "...", "source": "google:cse" }]
}
```

---

## Notes & hardening checklist
- Respect `robots.txt` and site terms; throttle with the Limiter DO.
- Rotate UA strings; implement exponential backoff + jitter.
- Prefer JSON-LD (schema.org) parsing; fall back to heuristics.
- Store enriched data in D1 (SQL) or KV; export CSV/JSON (see the D1 sketch after this list).
- Add a Cloudflare Queues dead-letter queue.
- Add retries + a circuit breaker around the experimental SERP scraper.
- Add tests with Miniflare for the Worker logic.
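
A minimal sketch of the D1 option, assuming a hypothetical `DB: D1Database` binding that is not declared in this starter's `Env` or `wrangler.toml`:

```ts
// Hypothetical D1 persistence for enriched pages. Assumes a `DB: D1Database`
// binding plus a table created with:
//   CREATE TABLE IF NOT EXISTS pages (url TEXT PRIMARY KEY, title TEXT, email TEXT, structured TEXT)
import type { EnrichedPage } from 'shared/types'

export async function saveToD1(db: D1Database, page: EnrichedPage) {
  await db
    .prepare('INSERT OR REPLACE INTO pages (url, title, email, structured) VALUES (?1, ?2, ?3, ?4)')
    .bind(page.url, page.title ?? null, page.email ?? null, JSON.stringify(page.structured ?? []))
    .run()
}

// In the queue consumer: await saveToD1(env.DB, enriched)
```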