Skip to content

Commit 9fbecff

Browse files
sql view for latest versions
1 parent 81d13eb commit 9fbecff

File tree

3 files changed

+84
-42
lines changed

3 files changed

+84
-42
lines changed

sql/latest_versions_view.sql

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
-- View exposing the latest version of each report series (one row per proper_title)
2+
-- Run: psql $DATABASE_URL -f sql/latest_versions_view.sql
3+
4+
--------------------------------------------------------------------------------
5+
-- LATEST VERSIONS VIEW
6+
--------------------------------------------------------------------------------
7+
8+
DROP VIEW IF EXISTS sg_reports_survey.latest_versions;
9+
10+
CREATE VIEW sg_reports_survey.latest_versions AS
11+
WITH
12+
-- Count versions per series (proper_title), skipping symbols that contain /CORR. or /REV.
13+
version_counts AS (
14+
SELECT proper_title, COUNT(*)::int as version_count
15+
FROM sg_reports_survey.reports
16+
WHERE proper_title IS NOT NULL
17+
AND symbol NOT LIKE '%/CORR.%'
18+
AND symbol NOT LIKE '%/REV.%'
19+
GROUP BY proper_title
20+
),
21+
-- Rank versions within each series; rn = 1 is the latest (by effective year, then publication_date, then symbol, all descending)
22+
ranked AS (
23+
SELECT
24+
r.id, r.symbol, r.proper_title, r.title, r.date_year, r.publication_date,
25+
r.un_body, r.subject_terms, r.embedding,
26+
re.entity,
27+
COALESCE(r.date_year,
28+
CASE WHEN r.publication_date ~ '^\d{4}'
29+
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int END
30+
) as effective_year,
31+
ROW_NUMBER() OVER (
32+
PARTITION BY r.proper_title
33+
ORDER BY
34+
COALESCE(r.date_year,
35+
CASE WHEN r.publication_date ~ '^\d{4}'
36+
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int END
37+
) DESC NULLS LAST,
38+
r.publication_date DESC NULLS LAST,
39+
r.symbol DESC
40+
) as rn
41+
FROM sg_reports_survey.reports r
42+
LEFT JOIN sg_reports_survey.reporting_entities re ON r.symbol = re.symbol
43+
WHERE r.proper_title IS NOT NULL
44+
AND r.symbol NOT LIKE '%/CORR.%'
45+
AND r.symbol NOT LIKE '%/REV.%'
46+
)
47+
SELECT
48+
r.id, r.symbol, r.proper_title, r.title, r.date_year, r.publication_date,
49+
r.un_body, r.subject_terms, r.embedding, r.entity, r.effective_year,
50+
vc.version_count
51+
FROM ranked r
52+
JOIN version_counts vc ON r.proper_title = vc.proper_title
53+
WHERE r.rn = 1;
54+
55+
COMMENT ON VIEW sg_reports_survey.latest_versions IS
56+
'Latest version of each report series (by proper_title), with version_count';

src/app/api/sg-reports/route.ts

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -187,39 +187,33 @@ export async function GET(req: NextRequest) {
187187
WHERE ${whereClause}`,
188188
params
189189
),
190-
// Body counts by distinct proper_title
190+
// Body counts (from latest_versions view - one per series)
191191
query<{ body: string; count: number }>(
192-
`SELECT un_body as body, COUNT(DISTINCT proper_title)::int as count
193-
FROM ${DB_SCHEMA}.reports
194-
WHERE un_body IS NOT NULL AND proper_title IS NOT NULL
192+
`SELECT un_body as body, COUNT(*)::int as count
193+
FROM ${DB_SCHEMA}.latest_versions
194+
WHERE un_body IS NOT NULL
195195
GROUP BY un_body ORDER BY count DESC`
196196
),
197-
// Year range (min/max)
197+
// Year range (from latest_versions view)
198198
query<{ min_year: number; max_year: number }>(
199-
`SELECT
200-
MIN(COALESCE(date_year, CASE WHEN publication_date ~ '^\\d{4}' THEN SUBSTRING(publication_date FROM 1 FOR 4)::int END))::int as min_year,
201-
MAX(COALESCE(date_year, CASE WHEN publication_date ~ '^\\d{4}' THEN SUBSTRING(publication_date FROM 1 FOR 4)::int END))::int as max_year
202-
FROM ${DB_SCHEMA}.reports
203-
WHERE proper_title IS NOT NULL`
199+
`SELECT MIN(effective_year)::int as min_year, MAX(effective_year)::int as max_year
200+
FROM ${DB_SCHEMA}.latest_versions`
204201
),
205-
// Get subject term counts - count by unique report title (not by version/symbol)
206-
// Only include subjects that appear in more than one report (excluding credentials)
202+
// Subject term counts (from latest_versions - one per series), excluding credentials and subjects that appear in only one series
207203
query<SubjectCount>(
208-
`SELECT subject as subject, COUNT(DISTINCT proper_title)::int as count
209-
FROM ${DB_SCHEMA}.reports, unnest(subject_terms) as subject
210-
WHERE proper_title IS NOT NULL
211-
AND subject != 'Representative''s credentials'
204+
`SELECT subject, COUNT(*)::int as count
205+
FROM ${DB_SCHEMA}.latest_versions, unnest(subject_terms) as subject
206+
WHERE subject != 'Representative''s credentials'
212207
GROUP BY subject
213-
HAVING COUNT(DISTINCT proper_title) > 1
208+
HAVING COUNT(*) > 1
214209
ORDER BY count DESC, subject`
215210
),
216-
// Entity counts by distinct proper_title
211+
// Entity counts (from latest_versions view)
217212
query<{ entity: string; count: number }>(
218-
`SELECT re.entity, COUNT(DISTINCT r.proper_title)::int as count
219-
FROM ${DB_SCHEMA}.reporting_entities re
220-
JOIN ${DB_SCHEMA}.reports r ON re.symbol = r.symbol
221-
WHERE re.entity IS NOT NULL AND r.proper_title IS NOT NULL
222-
GROUP BY re.entity ORDER BY count DESC`
213+
`SELECT entity, COUNT(*)::int as count
214+
FROM ${DB_SCHEMA}.latest_versions
215+
WHERE entity IS NOT NULL
216+
GROUP BY entity ORDER BY count DESC`
223217
),
224218
]);
225219

src/app/api/similar-reports/route.ts

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export async function GET(req: NextRequest) {
2424

2525
try {
2626
// Find similar reports using vector similarity search
27-
// Excludes other versions of the same report series (same proper_title)
27+
// The source report is looked up in the reports table (any version); candidate results come from latest_versions only
2828
const similar = await query<SimilarReport>(
2929
`WITH source AS (
3030
SELECT embedding, proper_title
@@ -33,31 +33,23 @@ export async function GET(req: NextRequest) {
3333
AND embedding IS NOT NULL
3434
)
3535
SELECT
36-
r.symbol,
37-
r.proper_title,
38-
COALESCE(
39-
r.date_year,
40-
CASE WHEN r.publication_date ~ '^\\d{4}'
41-
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int END
42-
) as year,
43-
1 - (r.embedding <=> s.embedding) as similarity,
44-
re.entity
45-
FROM ${DB_SCHEMA}.reports r
36+
lv.symbol,
37+
lv.proper_title,
38+
lv.effective_year as year,
39+
1 - (lv.embedding <=> s.embedding) as similarity,
40+
lv.entity
41+
FROM ${DB_SCHEMA}.latest_versions lv
4642
CROSS JOIN source s
47-
LEFT JOIN ${DB_SCHEMA}.reporting_entities re ON r.symbol = re.symbol
48-
WHERE r.embedding IS NOT NULL
49-
AND (s.proper_title IS NULL OR r.proper_title IS NULL OR TRIM(r.proper_title) != TRIM(s.proper_title))
50-
AND r.symbol != $1
51-
AND r.symbol NOT LIKE '%/CORR.%'
52-
AND r.symbol NOT LIKE '%/REV.%'
53-
ORDER BY r.embedding <=> s.embedding
43+
WHERE lv.embedding IS NOT NULL
44+
AND (s.proper_title IS NULL OR lv.proper_title IS NULL OR TRIM(lv.proper_title) != TRIM(s.proper_title))
45+
AND lv.symbol != $1
46+
ORDER BY lv.embedding <=> s.embedding
5447
LIMIT $2`,
5548
[symbol, limit]
5649
);
5750

5851
// If nothing was found, check whether the source report has an embedding; if it does not, return empty
5952
if (similar.length === 0) {
60-
// Check if the source report has an embedding
6153
const hasEmbedding = await query<{ has_embedding: boolean }>(
6254
`SELECT embedding IS NOT NULL as has_embedding
6355
FROM ${DB_SCHEMA}.reports

0 commit comments

Comments
 (0)