This repository was archived by the owner on Dec 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
165 lines (153 loc) · 5.27 KB
/
Makefile
File metadata and controls
165 lines (153 loc) · 5.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
DATABASE_URL?=postgres://localhost:5432/ads
define LOAD_ADS_SQL
CREATE TEMPORARY TABLE tmp_adverts ( \
id integer NOT NULL, \
page_id character varying(25) NOT NULL, \
page_name text NULL, \
post_id character varying(25) NOT NULL, \
country character varying(10) NOT NULL, \
ad_creation_time timestamp with time zone NULL, \
ad_creative_body text NULL, \
ad_creative_link_caption text NULL, \
ad_creative_link_description text NULL, \
ad_creative_link_title text NULL, \
ad_delivery_start_time timestamp with time zone NULL, \
ad_delivery_stop_time timestamp with time zone NULL, \
ad_snapshot_url text NULL, \
image_link text NULL, \
currency character varying(10) NULL, \
funding_entity text NULL, \
created_at timestamp with time zone NULL, \
updated_at timestamp with time zone NULL, \
ad_info character varying NULL, \
text_search tsvector NULL, \
potential_reach json, \
publisher_platforms character varying[] \
); \
COPY tmp_adverts FROM STDIN DELIMITER ',' CSV HEADER; \
INSERT INTO adverts ( \
id, \
page_id, \
page_name, \
post_id, \
country, \
ad_creation_time, \
ad_creative_body, \
ad_creative_link_caption, \
ad_creative_link_description, \
ad_creative_link_title, \
ad_delivery_start_time, \
ad_delivery_stop_time, \
ad_snapshot_url, \
image_link, \
currency, \
funding_entity, \
created_at, \
updated_at, \
ad_info, \
text_search \
) SELECT \
id, \
page_id, \
page_name, \
post_id, \
country, \
ad_creation_time, \
ad_creative_body, \
ad_creative_link_caption, \
ad_creative_link_description, \
ad_creative_link_title, \
ad_delivery_start_time, \
ad_delivery_stop_time, \
ad_snapshot_url, \
image_link, \
currency, \
funding_entity, \
created_at, \
updated_at, \
ad_info, \
text_search \
FROM tmp_adverts \
ON CONFLICT ON CONSTRAINT adverts_pkey \
DO NOTHING; \
DROP TABLE tmp_adverts;
endef
define UPDATE_IMPRESSIONS_SQL
CREATE TEMPORARY TABLE tmp_impressions ( \
advert_id integer NOT NULL, \
lower_bound_spend integer NOT NULL, \
upper_bound_spend integer NOT NULL, \
lower_bound_impressions integer, \
upper_bound_impressions integer \
); \
COPY tmp_impressions FROM STDIN DELIMITER ',' CSV HEADER; \
UPDATE adverts \
SET spend_lower_bound = tmp_impressions.lower_bound_spend, \
spend_upper_bound = tmp_impressions.upper_bound_spend, \
impressions_lower_bound = tmp_impressions.lower_bound_impressions, \
impressions_upper_bound = tmp_impressions.upper_bound_impressions \
FROM tmp_impressions \
WHERE adverts.id = tmp_impressions.advert_id; \
DROP TABLE tmp_impressions;
endef
.PHONY: load-ads
load-ads: adverts.csv
@echo "$(shell ruby -e "require 'csv'; puts CSV.read('$^').length - 1") records\n"
cat $^ | psql ${DATABASE_URL} -Xc "${LOAD_ADS_SQL}"
MAX_ID=$(shell psql $(DATABASE_URL) -Xtc 'SELECT COALESCE(MAX(id), 0) FROM adverts')
adverts.csv:
$(if ${ADS_PG_URL},,$(error must set ADS_PG_URL))
@echo "Getting adverts with id over $(strip $(MAX_ID)) ...\n"
@psql $(ADS_PG_URL) -Xc "COPY (SELECT * FROM adverts WHERE id > $(MAX_ID)) TO STDOUT DELIMITER ',' CSV HEADER;" > $@
##
# we must use ad_delivery_start_time as ad_creation_time is not indexed
IMPRESSIONS_FROM=2020-08-01
IMPRESSIONS_TO=2020-09-01
define ALL_IMPRESSIONS_FROM_SQL
SELECT DISTINCT ON (adverts.id) \
adverts.id AS advert_id, \
impressions.lower_bound_spend, \
impressions.upper_bound_spend, \
impressions.lower_bound_impressions, \
impressions.upper_bound_impressions \
FROM adverts \
LEFT JOIN impressions ON adverts.id = impressions.advert_id \
WHERE ad_delivery_start_time BETWEEN '$(IMPRESSIONS_FROM)' AND '$(IMPRESSIONS_TO)' \
AND impressions.lower_bound_spend IS NOT NULL \
AND funding_entity IN( \
'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE', \
'DONALD J. TRUMP FOR PRESIDENT, INC.', \
'BIDEN FOR PRESIDENT', \
'BIDEN VICTORY FUND', \
'Biden for President' \
) \
ORDER BY adverts.id DESC, impressions.created_at DESC
endef
impressions.csv:
$(if ${ADS_PG_URL},,$(error must set ADS_PG_URL))
@echo "Getting impressions between '$(IMPRESSIONS_FROM)' and '$(IMPRESSIONS_TO)' ...\n"
psql $(ADS_PG_URL) -Xc "COPY (${ALL_IMPRESSIONS_FROM_SQL}) TO STDOUT DELIMITER ',' CSV HEADER;" > $@
define UPDATE_ILLUMINATE_TAGS_SQL
CREATE TEMPORARY TABLE tmp_tags (row jsonb); \
COPY tmp_tags FROM STDIN; \
UPDATE adverts SET illuminate_tags = tmp_tags.row \
FROM tmp_tags \
WHERE row->>'ad_id' = adverts.post_id; \
DROP TABLE tmp_tags;
endef
.PHONY: update-impressions
update-impressions: impressions.csv
@echo "Updating impressions ..."
cat $^ | psql ${DATABASE_URL} -Xc "${UPDATE_IMPRESSIONS_SQL}"
results.json.gz:
$(if ${ILLUMINATE_BACKFILL_JSON_GZ_URL},,$(error must set ILLUMINATE_BACKFILL_JSON_GZ_URL))
curl -L ${ILLUMINATE_BACKFILL_JSON_GZ_URL} > $@
results.psql.json: results.json.gz # Strip array for psql COPY; One json {} stanza per line
gunzip -c $^ | ruby -e 'require "json"; JSON.parse(STDIN.read).each { |j| puts j.to_json }' > $@
.PHONY: backfill-illuminate-tags
backfill-illuminate-tags: results.psql.json
@echo "Backfilling illuminate tags"
cat $^ | psql ${DATABASE_URL} -Xc "${UPDATE_ILLUMINATE_TAGS_SQL}"
.PHONY: clean
clean:
- rm adverts.csv impressions.csv results.psql.json