Skip to content

Commit 3211f66

Browse files
committed
🌋 💣 💥 🧨 ranker: duplicate fixing for most common universities
Signed-off-by: Saif Ul Islam <[email protected]>
1 parent a17c9a1 commit 3211f66

File tree

6 files changed

+222
-27
lines changed

6 files changed

+222
-27
lines changed

projects/rank-nsf-linker/server/db.go

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,41 @@ func removeDuplicateEntries() error {
13051305
"University of Maryland College Park",
13061306
"University Of Maryland College Park",
13071307
"University Of Maryland, College Park",
1308+
1309+
// Purdue University variants
1310+
"Indiana University-Purdue University At Indianapolis",
1311+
1312+
// University Of Massachusetts Amherst
1313+
"University of Massachusetts Amherst",
1314+
"University Of Massachusetts Amherst",
1315+
"University Of Massachusetts - Amherst",
1316+
1317+
// University Of California, Irvine
1318+
"University Of California Irvine",
1319+
"Univ. of California - Irvine",
1320+
"University Of California At Irvine",
1321+
"University Of California, Irvine",
1322+
"Univ of California Irvine",
1323+
"Univ Of California Irvine",
1324+
1325+
// University Of California, Santa Barbara
1326+
"University Of California Santa Barbara",
1327+
"University Of Califonia, Santa Barbara",
1328+
"Univ. of California - Santa Barbara",
1329+
"Univ of California Santa Barbara",
1330+
"University Of California, Santa Barbara",
1331+
"Univ Of California Santa Barbara",
1332+
1333+
// University Of North Carolina Chapel Hill
1334+
"University Of North Carolina Chapel Hill",
1335+
"University Of North Carolina, Chapel Hill",
1336+
1337+
// University Of Minnesota Twin Cities
1338+
"University Of Minnesota Twin Cities",
1339+
"University Of Minnesota - Twin Cities",
1340+
1341+
// Pennsylvania State University, University Park
1342+
"Penn State University, University Park",
13081343
}},
13091344
} {
13101345
for column, values := range duplicates {
@@ -1356,6 +1391,122 @@ func removeTagsFromProfessorNames() error {
13561391
return nil
13571392
}
13581393

1394+
func copyLabsFromUniversitiesToLabsTable() error {
1395+
db, err := GetDB()
1396+
if err != nil {
1397+
logger.Errorf("❌ Cannot get DB: %v", err)
1398+
return fmt.Errorf("cannot get DB: %w", err)
1399+
}
1400+
1401+
logger.Infof("πŸ”Ž Searching for labs in universities table...")
1402+
1403+
// Find all labs in universities table
1404+
rows, err := db.Query(`
1405+
SELECT institution, street_address, city, phone, zip_code, country, region, countryabbrv, homepage, latitude, longitude
1406+
FROM universities
1407+
WHERE institution ILIKE '%lab%'
1408+
OR institution ILIKE '%laboratory%'
1409+
OR institution ILIKE '%research center%'
1410+
OR institution ILIKE '%research centre%'
1411+
OR institution ILIKE '%llc%'
1412+
OR institution ILIKE '%inc%'
1413+
OR institution ILIKE '%dept%'
1414+
`)
1415+
if err != nil {
1416+
logger.Errorf("❌ Failed to query labs from universities: %v", err)
1417+
return fmt.Errorf("failed to query labs from universities: %w", err)
1418+
}
1419+
1420+
defer rows.Close()
1421+
1422+
var labs []LabModel
1423+
var scannedRows int
1424+
1425+
for rows.Next() {
1426+
var institution, streetAddr, city, phone, zipCode, country, region, countryAbbrv, homepage string
1427+
var latitude, longitude float64
1428+
rowScanErr := rows.Scan(&institution, &streetAddr, &city, &phone, &zipCode, &country, &region, &countryAbbrv, &homepage, &latitude, &longitude)
1429+
if rowScanErr != nil {
1430+
logger.Errorf("❌ Failed to scan lab row: %v", rowScanErr)
1431+
return fmt.Errorf("failed to scan lab row: %w", rowScanErr)
1432+
}
1433+
labs = append(labs, LabModel{
1434+
Insitution: institution,
1435+
StreetAddress: streetAddr,
1436+
City: city,
1437+
Phone: phone,
1438+
ZipCode: zipCode,
1439+
Country: country,
1440+
Region: region,
1441+
CountryAbbrv: countryAbbrv,
1442+
Homepage: homepage,
1443+
Latitude: latitude,
1444+
Longitude: longitude,
1445+
})
1446+
scannedRows++
1447+
}
1448+
1449+
if len(labs) == 0 {
1450+
logger.Warnf("⚠️ No labs found in universities table to copy.")
1451+
return nil
1452+
}
1453+
1454+
logger.Infof("Found %d labs to copy from universities table.", scannedRows)
1455+
1456+
// Insert labs into labs table
1457+
tx, err := db.Begin()
1458+
if err != nil {
1459+
logger.Errorf("❌ Failed to begin transaction: %v", err)
1460+
return fmt.Errorf("failed to begin transaction: %w", err)
1461+
}
1462+
1463+
stmt, err := tx.Prepare(`
1464+
INSERT INTO labs (lab, street_address, city, phone, zip_code, country, region, countryabbrv, homepage, latitude, longitude)
1465+
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
1466+
ON CONFLICT (lab) DO NOTHING;
1467+
`)
1468+
if err != nil {
1469+
logger.Errorf("❌ Failed to prepare insert statement: %v", err)
1470+
tx.Rollback()
1471+
return fmt.Errorf("failed to prepare insert statement: %w", err)
1472+
}
1473+
1474+
defer stmt.Close()
1475+
1476+
var insertedLabs int
1477+
for _, lab := range labs {
1478+
_, statementExecErr := stmt.Exec(lab.Insitution, lab.StreetAddress, lab.City, lab.Phone, lab.ZipCode, lab.Country, lab.Region, lab.CountryAbbrv, lab.Homepage, lab.Latitude, lab.Longitude)
1479+
if statementExecErr != nil {
1480+
logger.Errorf("❌ Failed to insert lab '%s': %v", lab.Insitution, statementExecErr)
1481+
tx.Rollback()
1482+
return fmt.Errorf("failed to insert lab: %w", statementExecErr)
1483+
}
1484+
insertedLabs++
1485+
logger.Debugf("βž• Inserted lab '%s' into labs table.", lab.Insitution)
1486+
}
1487+
1488+
logger.Infof("πŸ—‘οΈ Deleting copied labs from universities table...")
1489+
1490+
var deletedLabs int
1491+
for _, lab := range labs {
1492+
if _, err := tx.Exec(`DELETE FROM universities WHERE institution = $1`, lab.Insitution); err != nil {
1493+
logger.Errorf("❌ Failed to delete lab '%s' from universities: %v", lab.Insitution, err)
1494+
tx.Rollback()
1495+
return fmt.Errorf("failed to delete lab from universities: %w", err)
1496+
}
1497+
deletedLabs++
1498+
logger.Debugf("πŸ—‘οΈ Deleted lab '%s' from universities table.", lab.Insitution)
1499+
}
1500+
1501+
if err := tx.Commit(); err != nil {
1502+
logger.Errorf("❌ Failed to commit transaction: %v", err)
1503+
return fmt.Errorf("failed to commit transaction: %w", err)
1504+
}
1505+
1506+
logger.Infof("βœ… Copied %d labs from universities to labs table and deleted them from universities.", insertedLabs)
1507+
return nil
1508+
}
1509+
13591510
func markPipelineAsCompleted(step string, status string) {
13601511
db, err := GetDB()
13611512
if err != nil {
@@ -1497,6 +1648,12 @@ func populatePostgres() {
14971648
return
14981649
}
14991650

1651+
if initProgressErr := copyLabsFromUniversitiesToLabsTable(); initProgressErr != nil {
1652+
logger.Errorf("failed to copy labs from universities to labs table: %v", initProgressErr)
1653+
markPipelineAsCompleted(string(PIPELINE_POPULATE_POSTGRES), string(PIPELINE_STATUS_FAILED))
1654+
return
1655+
}
1656+
15001657
logger.Infof("πŸŽ‰ Postgres population completed successfully.")
15011658
markPipelineAsCompleted(string(PIPELINE_POPULATE_POSTGRES), string(PIPELINE_STATUS_COMPLETED))
15021659
markPipelineAsCompleted(string(POPULATION_SUCCEEDED_MESSAGE), string(POPULATION_STATUS_SUCCEEDED))

projects/rank-nsf-linker/server/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ func main() {
7373
// we can mark it as completed here.
7474
markPipelineAsCompleted(string(PIPELINE_POPULATE_POSTGRES), string(PIPELINE_STATUS_COMPLETED))
7575

76+
if initProgressErr := copyLabsFromUniversitiesToLabsTable(); initProgressErr != nil {
77+
logger.Errorf("failed to copy labs from universities to labs table: %v", initProgressErr)
78+
}
79+
7680
if skipMigrations := os.Getenv(POPULATE_DB_FLAG); len(skipMigrations) == 0 {
7781
populatePostgres()
7882
}

projects/rank-nsf-linker/server/migrations/2_summary_view.sql

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,35 +9,41 @@ SELECT
99
u.city as city,
1010
u.region as region,
1111
u.country as country,
12-
u.homepage as homepage,
12+
COALESCE(CASE
13+
WHEN u.homepage LIKE 'https://%' THEN u.homepage
14+
ELSE 'https://' || u.homepage
15+
END, '') AS homepage,
1316
JSONB_BUILD_OBJECT (
1417
'total_faculty',
1518
COUNT(p.name),
1619
'total_funding',
17-
SUM(a.award_amount),
20+
COALESCE(SUM(a.award_amount), 0),
1821
'award_count',
1922
COUNT(a.id),
2023
'active_awards',
21-
SUM(
24+
COALESCE(SUM(
2225
CASE WHEN a.award_expiry_date :: date > CURRENT_DATE THEN 1 ELSE 0 END
23-
)
26+
), 0)
2427
) as stats,
25-
(
26-
SELECT
27-
JSONB_OBJECT_AGG(area, area_count)
28-
FROM
29-
(
30-
SELECT
31-
pf.area,
32-
COUNT(*) AS area_count
33-
FROM
34-
professor_areas pf
35-
WHERE
36-
pf.affiliation = u.institution
37-
GROUP BY
38-
pf.area
39-
) area_counts
40-
) AS research_areas
28+
COALESCE(
29+
(
30+
SELECT
31+
JSONB_OBJECT_AGG(area, area_count)
32+
FROM
33+
(
34+
SELECT
35+
pf.area,
36+
COUNT(*) AS area_count
37+
FROM
38+
professor_areas pf
39+
WHERE
40+
pf.affiliation = u.institution
41+
GROUP BY
42+
pf.area
43+
) area_counts
44+
),
45+
'{}'::jsonb
46+
) AS research_areas
4147
FROM
4248
universities u
4349
LEFT JOIN professors p ON u.institution = p.affiliation
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
ALTER TABLE labs ADD COLUMN IF NOT EXISTS lab TEXT PRIMARY KEY;
2+
DO $$
3+
BEGIN
4+
IF NOT EXISTS (
5+
SELECT 1
6+
FROM pg_constraint
7+
WHERE conname = 'labs_lab_universities_institution_fk'
8+
) THEN
9+
ALTER TABLE labs
10+
ADD CONSTRAINT labs_lab_universities_institution_fk FOREIGN KEY (lab)
11+
REFERENCES universities (institution) ON DELETE SET NULL;
12+
END IF;
13+
END
14+
$$;

projects/rank-nsf-linker/server/models.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,17 @@ type NsfJsonData struct {
8787
RawText string `json:"por_txt_cntn"`
8888
} `json:"por"`
8989
}
90+
91+
type LabModel struct {
92+
Insitution string
93+
StreetAddress string
94+
City string
95+
Phone string
96+
ZipCode string
97+
Country string
98+
Region string
99+
CountryAbbrv string
100+
Homepage string
101+
Latitude float64
102+
Longitude float64
103+
}

projects/rank-nsf-linker/server/scripts/top_universities/missing_universities.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ University of California--San Diego ---->>>> University Of California-San Diego
66
University of Wisconsin--Madison ---->>>> University of Wisconsin - Madison
77
University of California--Los Angeles ---->>>> The University Of California, Los Angeles
88
University of Maryland--College Park
9-
Purdue University--West Lafayette
10-
University of Massachusetts--Amherst
11-
University of California--Irvine
12-
University of California--Santa Barbara
13-
University of North Carolina--Chapel Hill
14-
University of Minnesota--Twin Cities
15-
Pennsylvania State University--University Park
9+
Purdue University--West Lafayette ---->>>> Purdue University
10+
University of Massachusetts--Amherst ---->>>> University Of Massachusetts, Amherst
11+
University of California--Irvine ---->>>> University Of California-Irvine
12+
University of California--Santa Barbara ---->>>> University Of California-Santa Barbara
13+
University of North Carolina--Chapel Hill ---->>>> University Of North Carolina At Chapel Hill
14+
University of Minnesota--Twin Cities ---->>>> University Of Minnesota-Twin Cities
15+
Pennsylvania State University--University Park ---->>>> Pennsylvania State Univ University Park
1616
Texas A&M University--College Station
1717
University of California--Davis
1818
University of Colorado--Boulder

0 commit comments

Comments
(0)