-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractors.py
More file actions
112 lines (92 loc) · 3.31 KB
/
extractors.py
File metadata and controls
112 lines (92 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import annotations
import re
from typing import List, Optional, Tuple
from bs4 import BeautifulSoup
# Substrings that signal a page title or heading names a research lab/group.
# Matched case-insensitively (callers lowercase the text before checking).
LAB_KEYWORDS = [
    "lab", "laboratory", "research", "group", "center", "centre", "institute",
]
def extract_lab_name(html: str) -> Optional[str]:
    """Best-effort extraction of a lab/group name from an HTML page.

    Preference order:
      1. the <title> text, if it contains a lab-like keyword;
      2. the first h1/h2/h3 heading (in that tag order) containing a keyword;
      3. the first non-empty h1 or h2, keyword or not.
    Returns None when nothing suitable is found.
    """
    soup = BeautifulSoup(html, "html.parser")

    def mentions_lab(text: str) -> bool:
        lowered = text.lower()
        return any(kw in lowered for kw in LAB_KEYWORDS)

    # 1) Page title, when it looks lab-related.
    title_tag = soup.title
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        if title_text and mentions_lab(title_text):
            return title_text

    # 2) Headings containing a keyword, h1 before h2 before h3.
    for level in ("h1", "h2", "h3"):
        for heading in soup.find_all(level):
            heading_text = heading.get_text(strip=True)
            if heading_text and mentions_lab(heading_text):
                return heading_text

    # 3) Fallback: first non-empty h1/h2 regardless of keywords.
    for level in ("h1", "h2"):
        heading = soup.find(level)
        if heading:
            fallback_text = heading.get_text(strip=True)
            if fallback_text:
                return fallback_text

    return None
# berkeley.edu addresses, including real subdomains (e.g. user@eecs.berkeley.edu).
# The domain part requires "berkeley.edu" as the registrable domain preceded only
# by complete dot-terminated labels, so look-alikes such as "fakeberkeley.edu"
# no longer match (the old pattern's bare "[A-Z0-9.-]*berkeley\.edu" did).
EMAIL_REGEX = re.compile(
    r"[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)*berkeley\.edu", re.I
)


def extract_emails_from_text(text: str) -> List[str]:
    """Return all unique berkeley.edu email addresses found in *text*.

    Addresses are lowercased and returned sorted. A None or empty *text*
    yields an empty list.
    """
    found = {m.group(0).lower() for m in EMAIL_REGEX.finditer(text or "")}
    return sorted(found)
# Two capitalized words separated by whitespace, e.g. "Jane Doe".
NAME_LIKE = re.compile(r"\b([A-Z][a-z]+\s+[A-Z][a-z]+)\b")


def looks_like_person_name(s: str) -> bool:
    """Heuristic: True when *s* (5-60 chars) contains a "First Last" name."""
    if not s:
        return False
    if not (5 <= len(s) <= 60):
        return False
    return NAME_LIKE.search(s) is not None
def extract_additional_names(html: str, limit: int = 10) -> List[str]:
    """Collect up to *limit* person-like names from people/team sections of *html*.

    Scans section/div/ul/ol nodes whose text mentions a people-ish keyword
    and keeps anchor texts that pass looks_like_person_name(); if fewer than
    3 candidates are found that way, falls back to scanning every anchor on
    the page. The result is de-duplicated preserving first-seen order and
    capped at *limit*.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidates: List[str] = []
    # Look for sections likely to list people
    section_selectors = [
        "section", "div", "ul", "ol",
    ]
    section_keywords = ["people", "members", "team", "students"]
    for sel in section_selectors:
        for node in soup.find_all(sel):
            # Keyword check runs over the node's entire flattened text.
            text = (node.get_text(" ", strip=True) or "").lower()
            if not any(k in text for k in section_keywords):
                continue
            # Collect anchor texts that look like names
            for a in node.find_all("a"):
                t = a.get_text(strip=True) or ""
                if looks_like_person_name(t):
                    candidates.append(t)
                if len(candidates) >= limit:
                    break
            if len(candidates) >= limit:
                break
        # NOTE: the breaks above exit only the node loop, not this selector
        # loop, so `candidates` may grow slightly past *limit* (and the same
        # element can be visited again under a different selector). The final
        # dedupe pass below is what actually enforces the cap.
    # Fallback: scan all anchors
    if len(candidates) < 3:
        for a in soup.find_all("a"):
            t = a.get_text(strip=True) or ""
            if looks_like_person_name(t):
                candidates.append(t)
            if len(candidates) >= limit:
                break
    # Deduplicate preserving order
    seen = set()
    out = []
    for n in candidates:
        if n not in seen:
            seen.add(n)
            out.append(n)
        if len(out) >= limit:
            break
    return out
def auto_generate_email_from_name(name: str) -> str:
    """Guess a berkeley.edu address from a person's display name.

    Uses the "first.last@berkeley.edu" pattern, taking the first and last
    whitespace-separated words of *name*. A single-word name yields
    "word@berkeley.edu" (the old code duplicated it as "word.word@...").
    An empty or whitespace-only name yields "".

    Note: this is a guess, not a verified address.
    """
    # str.split() with no args never yields empty strings, so no filtering
    # is needed; it also collapses runs of whitespace.
    parts = name.strip().split()
    if not parts:
        return ""
    first = parts[0].lower()
    last = parts[-1].lower()
    # Fix: with only one word, first == last; don't emit "cher.cher@...".
    if len(parts) == 1:
        return f"{first}@berkeley.edu"
    return f"{first}.{last}@berkeley.edu"