#### Export from BigQuery to GCS (CSV)

CSV is the easiest format to bulk-load into Postgres; the `*` in the URI tells BigQuery to shard the export across multiple files.

```sql
-- NOTE: column order here must match the \copy column list on the Postgres side.
EXPORT DATA OPTIONS (
  uri='gs://YOUR_BUCKET/overture/place_min_*.csv',
  format='CSV',
  overwrite=true,
  header=true,
  field_delimiter=','
)
AS
SELECT
  id,
  names.primary AS name,
  categories.primary AS category,
  ST_X(geometry) AS lon,
  ST_Y(geometry) AS lat,
  websites.list[SAFE_OFFSET(0)].element AS website,
  phones.list[SAFE_OFFSET(0)].element AS phone,
  emails.list[SAFE_OFFSET(0)].element AS email,
  socials.list[SAFE_OFFSET(0)].element AS social,
  addresses.list[SAFE_OFFSET(0)].element.freeform AS address,
  addresses.list[SAFE_OFFSET(0)].element.locality AS locality,
  addresses.list[SAFE_OFFSET(0)].element.postcode AS postcode,
  addresses.list[SAFE_OFFSET(0)].element.country AS country,
  brand.wikidata AS wikidata,
  confidence,
  sources.list[SAFE_OFFSET(0)].element.dataset AS src_dataset,
  sources.list[SAFE_OFFSET(0)].element.record_id AS src_record_id,
  TIMESTAMP(sources.list[SAFE_OFFSET(0)].element.update_time) AS src_update_time
FROM `bigquery-public-data.overture_maps.place`;
```
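
Optionally confirm the shards landed before pulling them down (BigQuery expands the `*` into a 12-digit shard number):

```bash
# list the exported shards, e.g. place_min_000000000000.csv, ...
gcloud storage ls gs://YOUR_BUCKET/overture/place_min_*.csv
```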


#### Load into Postgres

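The load assumes a `place_min` table already exists. A minimal DDL sketch (names and types are assumptions; `geom` is a generated `geography` column, PG 12+, so the radius queries later work without a separate backfill):

```sql
CREATE TABLE IF NOT EXISTS place_min (
  id              text PRIMARY KEY,
  name            text,
  category        text,
  lon             double precision,
  lat             double precision,
  website         text,
  phone           text,
  email           text,
  social          text,
  address         text,
  locality        text,
  postcode        text,
  country         text,
  wikidata        text,
  confidence      double precision,
  src_dataset     text,
  src_record_id   text,
  src_update_time timestamptz,
  -- derived point for meters-based ST_DWithin (see Notes & options)
  geom geography(Point, 4326)
    GENERATED ALWAYS AS (ST_SetSRID(ST_MakePoint(lon, lat), 4326)::geography) STORED
);

CREATE INDEX IF NOT EXISTS place_min_geom_idx ON place_min USING gist (geom);
```
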
```bash
# pull from GCS to a machine that has psql
gcloud storage cp gs://YOUR_BUCKET/overture/place_min_*.csv .

# bulk load; each shard carries its own header row, so load the files
# one by one instead of cat-ing them into a single stream
for f in place_min_*.csv; do
  psql "$PGURL" -c "\copy place_min(id,name,category,lon,lat,website,phone,email,social,address,locality,postcode,country,wikidata,confidence,src_dataset,src_record_id,src_update_time) FROM '$f' CSV HEADER"
done
```


#### Incremental updates (recommended)

Overture refreshes regularly; avoid full reloads by extracting only what changed.

BigQuery delta extract (only rows updated since your last successful run):

```sql
DECLARE last_run TIMESTAMP DEFAULT TIMESTAMP('2025-08-01 00:00:00+00'); -- replace via param/metadata

EXPORT DATA OPTIONS (
  uri='gs://YOUR_BUCKET/overture/delta/place_min_@{run_date}_*.csv',
  format='CSV', overwrite=true, header=true
)
AS
SELECT *
FROM (
  -- same SELECT as the full export above
)
WHERE src_update_time >= last_run;
```
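
One way to supply the watermark without editing the script (a sketch; the file name `export_delta.sql` is an assumption, and it swaps the `DECLARE` for a `@last_run` query parameter):

```bash
# derive the watermark from what's already loaded, then pass it to BigQuery;
# BigQuery treats a zone-less timestamp string as UTC
LAST_RUN=$(psql "$PGURL" -Atc \
  "SELECT COALESCE(max(src_update_time) AT TIME ZONE 'UTC', timestamp 'epoch')::text FROM place_min")

bq query --use_legacy_sql=false \
  --parameter="last_run:TIMESTAMP:${LAST_RUN}" \
  "$(cat export_delta.sql)"  # same EXPORT DATA, with @last_run instead of the DECLARE
```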

Postgres upsert via a staging table:

```sql
-- Run this whole script in ONE psql session (e.g. psql -f): the staging
-- table is TEMP and will not survive separate psql -c invocations.
CREATE TEMP TABLE place_min_stg (LIKE place_min);

-- 'delta_files.csv' = the delta shard(s) pulled from GCS
\copy place_min_stg(id,name,category,lon,lat,website,phone,email,social,address,locality,postcode,country,wikidata,confidence,src_dataset,src_record_id,src_update_time) FROM 'delta_files.csv' CSV HEADER

INSERT INTO place_min AS t (
  id,name,category,lon,lat,website,phone,email,social,address,locality,postcode,country,wikidata,confidence,src_dataset,src_record_id,src_update_time
)
SELECT
  id,name,category,lon,lat,website,phone,email,social,address,locality,postcode,country,wikidata,confidence,src_dataset,src_record_id,src_update_time
FROM place_min_stg
ON CONFLICT (id) DO UPDATE
SET
  name            = EXCLUDED.name,
  category        = EXCLUDED.category,
  lon             = EXCLUDED.lon,
  lat             = EXCLUDED.lat,
  website         = EXCLUDED.website,
  phone           = EXCLUDED.phone,
  email           = EXCLUDED.email,
  social          = EXCLUDED.social,
  address         = EXCLUDED.address,
  locality        = EXCLUDED.locality,
  postcode        = EXCLUDED.postcode,
  country         = EXCLUDED.country,
  wikidata        = EXCLUDED.wikidata,
  confidence      = EXCLUDED.confidence,
  src_dataset     = EXCLUDED.src_dataset,
  src_record_id   = EXCLUDED.src_record_id,
  src_update_time = EXCLUDED.src_update_time
WHERE t.src_update_time IS NULL OR EXCLUDED.src_update_time > t.src_update_time;
```
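
Run it in one shot so the TEMP table lives for the whole script (file name assumed):

```bash
psql "$PGURL" -v ON_ERROR_STOP=1 -f upsert_delta.sql
```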

#### Sanity checks & sample query

```sql
-- record counts
SELECT COUNT(*) FROM place_min;

-- quick Vienna radius test (500 m)
SELECT id, name, category
FROM place_min
WHERE ST_DWithin(
  geom,
  ST_SetSRID(ST_MakePoint(16.3738, 48.2082), 4326)::geography,
  500
)
ORDER BY confidence DESC NULLS LAST
LIMIT 50;
```
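
To check that the radius query actually hits the GiST index (assuming the index from the DDL sketch above):

```sql
EXPLAIN (ANALYZE, BUFFERS)
SELECT id
FROM place_min
WHERE ST_DWithin(
  geom,
  ST_SetSRID(ST_MakePoint(16.3738, 48.2082), 4326)::geography,
  500
);
```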

#### Notes & options

**PostGIS type:** I used `geography(Point,4326)` for easy meters-based `ST_DWithin`. If you prefer `geometry(Point,4326)`, cast with `geom::geography` for meter distances, or transform to a suitable projected CRS.

**Multiple values:** I picked the first website/phone/social/address. If you want arrays, we can keep them as Postgres `text[]` (and flatten in BQ with `ARRAY_TO_STRING(..., '|')`), as sketched below.
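
A sketch of that variant (the column names `websites_raw`/`websites_arr` are made up for illustration):

```sql
-- BigQuery side: collapse the repeated field into one delimited string
SELECT ARRAY_TO_STRING(
         ARRAY(SELECT w.element FROM UNNEST(websites.list) AS w), '|'
       ) AS websites_raw
FROM `bigquery-public-data.overture_maps.place`;

-- Postgres side (assuming the delimited string was loaded into websites_raw):
ALTER TABLE place_min ADD COLUMN websites_arr text[];
UPDATE place_min SET websites_arr = string_to_array(websites_raw, '|');
```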

**Performance:** On 20–30M rows, `COPY` + `ON CONFLICT` is still very fast. For huge refreshes, consider:

- Loading deltas into a permanent `place_min_delta` table and running a single `INSERT ... ON CONFLICT`.

- Periodic `VACUUM (ANALYZE)` on `place_min`.

**Automation:** Wrap the steps in a small script (Cloud Build, GitLab CI, or a cron on a VM).