GPT Typescript & React Web Scraper

https://chatgpt.com/canvas/shared/68ebf7c973cc81918680d700eb83d3a4

# High‑End Web Crawler/Scraper + UI (TypeScript)

Target stack: Cloudflare Workers (edge) + Hono, Puppeteer via Workers Browser Rendering, Queues + Durable Objects, React 19 + Vite + shadcn/ui.

> ⚖️ Note: Directly scraping Google Search pages may violate Google’s ToS. This starter ships two providers: a compliant default (Google Programmable Search JSON API) and an experimental SERP scraper behind a feature flag. Use the API provider in production.

---

## Monorepo layout

```

apps/
  worker/                    # Cloudflare Worker API, crawlers, queues
    src/
      index.ts               # Hono app, routes
      providers/
        googleProgrammableSearch.ts
        googleSerpScraper.ts # experimental (feature-flagged)
      crawl/
        crawlUrls.ts
        extractors/
          schemaOrg.ts
          generic.ts
      rate-limit/
        limiter.ts           # Durable Objects rate limiter
    wrangler.toml
    package.json
    tsconfig.json
  web/                       # React 19 + Vite + shadcn/ui
    src/
      main.tsx
      App.tsx
      components/
        SearchForm.tsx
        ResultsTable.tsx
        JobSidebar.tsx
      lib/
        api.ts
        useJob.ts
    index.html
    package.json
    tsconfig.json
packages/
  shared/
    src/types.ts
    package.json

```

---

## Cloudflare Worker — `apps/worker/src/index.ts`

```ts
// apps/worker/src/index.ts
import { Hono } from 'hono'
import { cors } from 'hono/cors'
import { prettyJSON } from 'hono/pretty-json'
import { googleProgrammableSearch } from './providers/googleProgrammableSearch'
import { googleSerpScraper } from './providers/googleSerpScraper' // experimental
import { enqueueCrawl } from './crawl/crawlUrls'
import type { Env, SearchRequest, SearchResponse } from 'shared/types'

// Durable Object classes must be exported from the Worker entry point
export { Limiter } from './rate-limit/limiter'

const app = new Hono<{ Bindings: Env }>()
app.use('*', cors())
app.use('*', prettyJSON())

app.get('/health', (c) => c.text('ok'))

// Kick off a search job (Google provider + optional follow-up crawling)
app.post('/api/search', async (c) => {
  const body = (await c.req.json()) as SearchRequest
  const provider = body.provider ?? 'programmable'
  const query = body.query?.trim()
  if (!query) return c.json({ error: 'Missing query' }, 400)

  let results: SearchResponse
  if (provider === 'programmable') {
    results = await googleProgrammableSearch(c.env, { query, page: body.page ?? 1, region: body.region })
  } else {
    // experimental: requires `BROWSER` binding in wrangler.toml
    results = await googleSerpScraper(c.env, { query, page: body.page ?? 1, region: body.region })
  }

  // Optionally queue deep-crawling of result URLs for structured data enrichment
  if (body.deepCrawl) {
    await enqueueCrawl(c.env, results.items.map((i) => i.url))
  }

  return c.json(results)
})

// Ingestor for the crawl queue (consumer)
export default {
  async fetch(request: Request, env: Env, ctx: ExecutionContext) {
    return app.fetch(request, env, ctx)
  },
  async queue(batch: MessageBatch<string>, env: Env, ctx: ExecutionContext) {
    for (const msg of batch.messages) {
      try {
        const { crawlAndExtract } = await import('./crawl/crawlUrls')
        const enriched = await crawlAndExtract(env, msg.body)
        // TODO: persist enriched data (KV/D1/R2) or forward to webhook
        console.log('Enriched', enriched.url)
        msg.ack()
      } catch (e) {
        console.error('Crawl failed', e)
        msg.retry()
      }
    }
  },
}
```
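The queue consumer above leaves persistence as a `TODO`. A minimal sketch of writing enriched pages to Workers KV, assuming a `CRAWL_RESULTS` KV namespace binding that is not part of this starter (you would add it to `wrangler.toml` and to the shared `Env` type):

```ts
// Hypothetical persistence helper: assumes a CRAWL_RESULTS KV namespace binding
// (add [[kv_namespaces]] binding = "CRAWL_RESULTS" to wrangler.toml and to Env).
import type { EnrichedPage } from 'shared/types'

type EnvWithKV = { CRAWL_RESULTS: KVNamespace }

export async function persistEnrichedPage(env: EnvWithKV, page: EnrichedPage) {
  // Key by URL so re-crawls overwrite stale entries; expire after 30 days.
  await env.CRAWL_RESULTS.put(`page:${page.url}`, JSON.stringify(page), {
    expirationTtl: 60 * 60 * 24 * 30,
  })
}
```

In the queue handler, `await persistEnrichedPage(env, enriched)` would then replace the `console.log`.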

---

## Google Provider (compliant default) — `googleProgrammableSearch.ts`

```ts
// apps/worker/src/providers/googleProgrammableSearch.ts
import type { Env, SearchResponse } from 'shared/types'

const ENDPOINT = 'https://www.googleapis.com/customsearch/v1'

export async function googleProgrammableSearch(env: Env, {
  query,
  page = 1,
  region,
}: { query: string; page?: number; region?: string }): Promise<SearchResponse> {
  const start = (page - 1) * 10 + 1
  const u = new URL(ENDPOINT)
  u.searchParams.set('key', env.GOOGLE_API_KEY)
  u.searchParams.set('cx', env.GOOGLE_CSE_ID)
  u.searchParams.set('q', query)
  u.searchParams.set('start', String(start))
  if (region) u.searchParams.set('gl', region) // country code; optional

  const res = await fetch(u.toString())
  if (!res.ok) throw new Error(`Google API error: ${res.status}`)
  const json = (await res.json()) as any

  const items = (json.items ?? []).map((it: any) => ({
    title: it.title as string,
    url: it.link as string,
    snippet: it.snippet as string,
    source: 'google:cse' as const,
  }))

  return {
    provider: 'programmable',
    query,
    page,
    items,
    total: json.searchInformation?.totalResults ? Number(json.searchInformation.totalResults) : items.length,
  }
}

```

---

## Experimental SERP Scraper (feature‑flag) — `googleSerpScraper.ts`

> Requires **Workers Browser Rendering** (`BROWSER` binding) and must abide by your own compliance policy. Backoff and rate‑limiting via Durable Objects are strongly recommended.

```ts
// apps/worker/src/providers/googleSerpScraper.ts
import type { Env, SearchResponse } from 'shared/types'

export async function googleSerpScraper(env: Env, {
  query, page = 1, region,
}: { query: string; page?: number; region?: string }): Promise<SearchResponse> {
  const puppeteer = await import('@cloudflare/puppeteer')
  // region code influences Google locale via hl/gl query params
  const q = new URL('https://www.google.com/search')
  q.searchParams.set('q', query)
  if (region) q.searchParams.set('gl', region)
  if (page > 1) q.searchParams.set('start', String((page - 1) * 10))

  const browser = await puppeteer.launch(env.BROWSER)
  const pageObj = await browser.newPage()
  await pageObj.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36')
  await pageObj.goto(q.toString(), { waitUntil: 'networkidle0' })

  const items = await pageObj.$$eval('a[href^="http"] h3', (hs) => {
    return hs.map((h) => {
      const a = h.closest('a') as HTMLAnchorElement | null
      const title = (h as HTMLElement).innerText
      const url = a?.href || ''
      const snippetEl = a?.parentElement?.parentElement?.querySelector('div[data-sncf]') as HTMLElement | null
      const snippet = snippetEl?.innerText || ''
      return { title, url, snippet, source: 'google:html' as const }
    })
  })

  await browser.close()
  // filter obvious Google redirectors or tracking params
  const cleaned = items.filter(i => i.url && !i.url.includes('/search?') && !i.url.includes('googleusercontent.com'))
  return { provider: 'serp-scraper', query, page, items: cleaned, total: cleaned.length }
}
```
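The note above recommends backoff around the scraper; here is a minimal retry-with-exponential-backoff-and-jitter sketch (the `withBackoff` helper is illustrative, not part of the starter):

```ts
// Hypothetical helper: retries a flaky operation (e.g. the SERP scraper)
// with exponential backoff plus random "full jitter".
export async function withBackoff<T>(
  fn: () => Promise<T>,
  { retries = 3, baseMs = 500 }: { retries?: number; baseMs?: number } = {},
): Promise<T> {
  let lastError: unknown
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      return await fn()
    } catch (e) {
      lastError = e
      if (attempt === retries) break
      // sleep a random amount up to the exponential cap
      const capMs = baseMs * 2 ** attempt
      await new Promise((r) => setTimeout(r, Math.random() * capMs))
    }
  }
  throw lastError
}

// Usage (sketch): withBackoff(() => googleSerpScraper(env, { query }))
```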

---

## Crawl & Extract — `crawl/crawlUrls.ts`

```ts
// apps/worker/src/crawl/crawlUrls.ts
// HTMLRewriter is provided as a global by the Workers runtime (no import needed)
import type { Env, EnrichedPage } from 'shared/types'
import { parseSchemaOrgJsonLd } from './extractors/schemaOrg'

export async function enqueueCrawl(env: Env, urls: string[]) {
  for (const u of urls) await env.CRAWL_QUEUE.send(u)
}

export async function crawlAndExtract(env: Env, url: string): Promise<EnrichedPage> {
  const res = await fetch(url, { headers: { 'User-Agent': 'LeverageAI-Crawler/1.0 (+https://example.com/bot)' } })
  const contentType = res.headers.get('content-type') || ''
  const html = await res.text()

  // Extract with JSON-LD first (LocalBusiness, Organization, etc.)
  const structured = parseSchemaOrgJsonLd(html)

  // Lightweight extraction with HTMLRewriter for title/phone/email fallbacks
  let title = ''
  let email = ''
  const rewriter = new HTMLRewriter()
    .on('title', { text(t) { title += t.text } })
    .on('a[href^="mailto:"]', {
      element(el) {
        const m = el.getAttribute('href')?.replace('mailto:', '') || ''
        if (!email) email = m
      },
    })

  await rewriter.transform(new Response(html, { headers: { 'content-type': contentType } })).text()

  return { url, title: title.trim(), email, structured }
}
```
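`crawlAndExtract` fetches pages directly; the hardening checklist below recommends respecting robots.txt. A deliberately simplified pre-check sketch (the `isAllowedByRobots` helper is an assumption; a production crawler should use a full robots.txt parser):

```ts
// Hypothetical helper: a very simplified robots.txt check for the wildcard
// user-agent group only.
export async function isAllowedByRobots(url: string): Promise<boolean> {
  const { origin, pathname } = new URL(url)
  try {
    const res = await fetch(`${origin}/robots.txt`)
    if (!res.ok) return true // no robots.txt (or unreadable): assume allowed
    const text = await res.text()

    let inWildcardGroup = false
    const disallows: string[] = []
    for (const raw of text.split('\n')) {
      const line = raw.split('#')[0].trim()
      const [field, ...rest] = line.split(':')
      const value = rest.join(':').trim()
      if (/^user-agent$/i.test(field)) inWildcardGroup = value === '*'
      else if (inWildcardGroup && /^disallow$/i.test(field) && value) disallows.push(value)
    }
    return !disallows.some((prefix) => pathname.startsWith(prefix))
  } catch {
    return true
  }
}

// Usage (sketch): if (!(await isAllowedByRobots(url))) return { url }
```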

---

## Schema.org JSON‑LD Parser — `extractors/schemaOrg.ts`

```ts
// apps/worker/src/crawl/extractors/schemaOrg.ts
import type { StructuredData } from 'shared/types'

export function parseSchemaOrgJsonLd(html: string): StructuredData[] {
  const scripts = [...html.matchAll(/<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi)]
  const out: StructuredData[] = []
  for (const [, json] of scripts) {
    try {
      const data = JSON.parse(json)
      const arr = Array.isArray(data) ? data : [data]
      for (const d of arr) out.push({
        '@type': d['@type'],
        name: d.name, telephone: d.telephone, url: d.url, address: d.address, sameAs: d.sameAs,
      })
    } catch {}
  }
  return out
}
```
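The monorepo layout lists `extractors/generic.ts`, which is not shown in this starter; a minimal heuristic-fallback sketch (regex phone extraction over stripped HTML, with the pattern tuned for North-American numbers as an assumption):

```ts
// apps/worker/src/crawl/extractors/generic.ts (sketch, not part of the code above)
// Heuristic fallbacks for pages without schema.org markup; complements the
// mailto: extraction already done in crawlUrls.ts.

// Rough North-American-style phone matcher; tune for your target regions.
const PHONE_RE = /\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}/g

export function extractPhones(html: string): string[] {
  // Strip scripts and tags so we only match visible-ish text.
  const text = html.replace(/<script[\s\S]*?<\/script>/gi, '').replace(/<[^>]+>/g, ' ')
  return [...new Set(text.match(PHONE_RE) ?? [])]
}
```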

---

## Durable Objects Rate Limiter — `rate-limit/limiter.ts`

```ts
// apps/worker/src/rate-limit/limiter.ts
export class Limiter {
  state: DurableObjectState
  constructor(state: DurableObjectState) { this.state = state }

  async fetch(req: Request) {
    const { key, limit = 30, window = 60 } = (await req.json()) as { key: string; limit?: number; window?: number }
    const now = Date.now()
    const bucket = (await this.state.storage.get<{ count: number; reset: number }>(key)) || { count: 0, reset: now + window * 1000 }
    if (now > bucket.reset) { bucket.count = 0; bucket.reset = now + window * 1000 }
    if (++bucket.count > limit) return new Response('rate_limited', { status: 429 })
    await this.state.storage.put(key, bucket)
    return new Response(JSON.stringify({ remaining: limit - bucket.count, reset: bucket.reset }), { headers: { 'content-type': 'application/json' } })
  }
}
```
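Nothing above actually calls the limiter; a sketch of gating the experimental scraper through the `RATE_LIMITER` binding (assumes `RATE_LIMITER: DurableObjectNamespace` is part of the shared `Env` type, as in the types below):

```ts
// Sketch: consult the Limiter DO before hitting Google with the SERP scraper.
import type { Env } from 'shared/types'

export async function checkRateLimit(env: Env, key: string): Promise<boolean> {
  const stub = env.RATE_LIMITER.get(env.RATE_LIMITER.idFromName(key))
  const res = await stub.fetch('https://limiter.internal/', {
    method: 'POST',
    body: JSON.stringify({ key, limit: 30, window: 60 }),
  })
  return res.status !== 429 // false means the caller should back off
}

// In the /api/search route, before the serp-scraper branch:
// if (!(await checkRateLimit(c.env, 'google-serp'))) return c.json({ error: 'rate_limited' }, 429)
```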

---

## Worker bindings — `apps/worker/wrangler.toml`

```toml
name = "leverageai-crawler"
main = "src/index.ts"
compatibility_date = "2025-09-15"

[vars]
GOOGLE_CSE_ID = "replaceme"
GOOGLE_API_KEY = "replaceme"

# Durable Object classes need a migration on first deploy
[[migrations]]
tag = "v1"
new_classes = ["Limiter"]

[[durable_objects.bindings]]
name = "RATE_LIMITER"
class_name = "Limiter"

[[queues.producers]]
queue = "crawl-queue"      # queue names must be lowercase
binding = "CRAWL_QUEUE"

[[queues.consumers]]
queue = "crawl-queue"

[browser]
binding = "BROWSER" # required for experimental SERP scraper
```

---

## Shared types — `packages/shared/src/types.ts`

```ts
// packages/shared/src/types.ts
export type Env = {
  GOOGLE_API_KEY: string
  GOOGLE_CSE_ID: string
  CRAWL_QUEUE: Queue
  RATE_LIMITER: DurableObjectNamespace
  BROWSER: any
}

export type SearchRequest = {
  query: string
  page?: number
  region?: string
  provider?: 'programmable' | 'serp-scraper'
  deepCrawl?: boolean
}

export type SearchItem = { title: string; url: string; snippet?: string; source: 'google:cse' | 'google:html' }
export type SearchResponse = { provider: string; query: string; page: number; items: SearchItem[]; total: number }

export type StructuredData = { '@type'?: string; name?: string; telephone?: string; url?: string; address?: unknown; sameAs?: string[] }
export type EnrichedPage = { url: string; title?: string; email?: string; structured?: StructuredData[] }

```

---

## React 19 + shadcn UI — `apps/web/src/App.tsx`

```tsx
// apps/web/src/App.tsx
import { useState } from 'react'
import { SearchForm } from './components/SearchForm'
import { ResultsTable } from './components/ResultsTable'
import { JobSidebar } from './components/JobSidebar'

export default function App() {
  const [rows, setRows] = useState<any[]>([])
  return (
    <div className="min-h-svh grid grid-cols-12 gap-6 p-6 bg-background text-foreground">
      <div className="col-span-8 space-y-6">
        <SearchForm onResults={setRows} />
        <ResultsTable rows={rows} />
      </div>
      <div className="col-span-4">
        <JobSidebar />
      </div>
    </div>
  )
}

```

---

### `SearchForm.tsx`

```tsx

import { useState, useTransition } from 'react'
import { Input } from '@/components/ui/input'
import { Button } from '@/components/ui/button'
import { Label } from '@/components/ui/label'
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
import { api } from '../lib/api'

export function SearchForm({ onResults }: { onResults: (rows: any[]) => void }) {
  const [query, setQuery] = useState('Best Electricians in Southern Oregon')
  const [deepCrawl, setDeepCrawl] = useState(true)
  const [isPending, startTransition] = useTransition()

  return (
    <Card>
      <CardHeader><CardTitle>Google Search + Crawl</CardTitle></CardHeader>
      <CardContent className="space-y-3">
        <Label htmlFor="q">Query</Label>
        <Input id="q" value={query} onChange={(e) => setQuery(e.target.value)} />
        <div className="flex items-center gap-2">
          <input id="deep" type="checkbox" checked={deepCrawl} onChange={(e) => setDeepCrawl(e.target.checked)} />
          <Label htmlFor="deep">Deep crawl result sites (enrich with schema.org)</Label>
        </div>
        <div className="flex gap-2">
          <Button disabled={isPending} onClick={() => startTransition(async () => {
            const res = await api.search({ query, deepCrawl, provider: 'programmable' })
            onResults(res.items)
          })}>Search</Button>
          <Button variant="secondary" disabled={isPending} onClick={() => startTransition(async () => {
            const res = await api.search({ query, deepCrawl, provider: 'serp-scraper' })
            onResults(res.items)
          })}>Try Experimental Scraper</Button>
        </div>
      </CardContent>
    </Card>
  )
}

```

---

### `ResultsTable.tsx`

```tsx

import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'

export function ResultsTable({ rows }: { rows: any[] }) {
  return (
    <Card>
      <CardHeader><CardTitle>Results ({rows.length})</CardTitle></CardHeader>
      <CardContent>
        <div className="grid gap-3">
          {rows.map((r, i) => (
            <a key={i} href={r.url} target="_blank" rel="noreferrer" className="p-3 rounded-xl border hover:bg-muted">
              <div className="font-medium">{r.title}</div>
              <div className="text-sm text-muted-foreground break-all">{r.url}</div>
              {r.snippet && <div className="text-sm mt-1">{r.snippet}</div>}
            </a>
          ))}
        </div>
      </CardContent>
    </Card>
  )
}

```

---

### `JobSidebar.tsx`

```tsx

import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
export function JobSidebar() {
  return (
    <Card>
      <CardHeader><CardTitle>Job Status</CardTitle></CardHeader>
      <CardContent className="text-sm text-muted-foreground">
        Deep crawl jobs are queued and processed at the edge. Refresh to see enriched data in your store.
      </CardContent>
    </Card>
  )
}

```

---

### Simple API helper — `apps/web/src/lib/api.ts`

```ts

export const api = {
  async search(payload: any) {
    // Normalize the base so VITE_API_BASE works with or without a trailing slash.
    const base = (import.meta.env.VITE_API_BASE || '').replace(/\/$/, '')
    const r = await fetch(`${base}/api/search`, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(payload),
    })
    if (!r.ok) throw new Error('API error')
    return r.json()
  },
}
```
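The layout lists `lib/useJob.ts` and the sidebar mentions queued jobs, but the Worker exposes no job-status route; a minimal polling-hook sketch against a hypothetical `GET /api/jobs/:id` endpoint you would have to add:

```ts
// apps/web/src/lib/useJob.ts (sketch: the /api/jobs/:id endpoint is hypothetical)
import { useEffect, useState } from 'react'

export type JobStatus = { id: string; state: 'queued' | 'running' | 'done' | 'failed' }

export function useJob(jobId?: string, intervalMs = 5000) {
  const [job, setJob] = useState<JobStatus | null>(null)

  useEffect(() => {
    if (!jobId) return
    let cancelled = false
    const tick = async () => {
      const r = await fetch(`/api/jobs/${jobId}`) // hypothetical route, not in the Worker above
      if (r.ok && !cancelled) setJob(await r.json())
    }
    tick()
    const t = setInterval(tick, intervalMs)
    return () => { cancelled = true; clearInterval(t) }
  }, [jobId, intervalMs])

  return job
}
```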

---

## Vite + shadcn setup (web)

1. `npm i -D vite typescript tailwindcss postcss autoprefixer`
2. `npm i react react-dom`
3. Install shadcn/ui and components (Button, Card, Input, Label, Table):

   ```sh
   npx shadcn@latest init
   npx shadcn@latest add button card input label table
   ```

4. Add Tailwind and alias `@/components` per the shadcn docs (see the config sketch below).
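For step 4, a sketch of the alias wiring, assuming the stock Vite + React plugin setup from the shadcn docs:

```ts
// apps/web/vite.config.ts (sketch)
import path from 'node:path'
import react from '@vitejs/plugin-react'
import { defineConfig } from 'vite'

export default defineConfig({
  plugins: [react()],
  resolve: {
    // lets '@/components/ui/button' resolve to src/components/ui/button
    alias: { '@': path.resolve(__dirname, './src') },
  },
})

// tsconfig.json needs the matching paths entry:
// "compilerOptions": { "baseUrl": ".", "paths": { "@/*": ["./src/*"] } }
```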
---

## Running locally

```sh
# Root
pnpm i
# Worker
cd apps/worker && pnpm dev  # wrangler dev
# Web
cd ../web && pnpm dev
```

Deploy Worker:

```sh
cd apps/worker
pnpm run deploy  # wrangler deploy
```

---

## Usage example

`POST /api/search` with body:

```json
{ "query": "Best Electricians in Southern Oregon", "provider": "programmable", "deepCrawl": true }
```

Response (shape):

```json
{
  "provider": "programmable",
  "query": "Best Electricians in Southern Oregon",
  "page": 1,
  "total": 10,
  "items": [{ "title": "...", "url": "https://...", "snippet": "...", "source": "google:cse" }]
}
```

---

## Notes & hardening checklist

- Respect robots.txt & site terms; throttle with the Limiter DO.
- Rotate UA strings; implement exponential backoff + jitter.
- Prefer JSON-LD (schema.org) parsing; fall back to heuristics.
- Store enriched data in D1 (SQL) or KV; export CSV/JSON.
- Add a Cloudflare Queues dead-letter queue.
- Add retries and a circuit breaker around the experimental SERP scraper.
- Add tests with Miniflare for Worker logic (see the sketch below).
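For the last item, a minimal smoke-test sketch with Miniflare 3 and Vitest, assuming the Worker has been bundled to `dist/index.js` (adjust to your build output):

```ts
// apps/worker/test/health.spec.ts (sketch)
import { Miniflare } from 'miniflare'
import { afterAll, expect, it } from 'vitest'

// Loads the bundled Worker into workerd and exercises the /health route.
const mf = new Miniflare({
  modules: true,
  scriptPath: 'dist/index.js',
  compatibilityDate: '2025-09-15',
})

it('responds on /health', async () => {
  const res = await mf.dispatchFetch('http://localhost/health')
  expect(await res.text()).toBe('ok')
})

afterAll(() => mf.dispose())
```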