-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractors.py
More file actions
112 lines (92 loc) · 3.31 KB
/
extractors.py
File metadata and controls
112 lines (92 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import annotations
import re
from typing import List, Optional, Tuple
from bs4 import BeautifulSoup
# Substrings that signal a page title or heading names a research lab/group.
# Matched case-insensitively (callers lowercase the text before checking).
LAB_KEYWORDS = [
    "lab", "laboratory", "research", "group", "center", "centre", "institute",
]
def extract_lab_name(html: str) -> Optional[str]:
    """Best-effort extraction of a lab/group name from an HTML page.

    Preference order:
      1. the <title> text, if it contains a lab-like keyword;
      2. the first h1/h2/h3 heading (in that tag order) containing a keyword;
      3. the first non-empty h1 or h2, keyword or not.
    Returns None when nothing suitable is found.
    """
    soup = BeautifulSoup(html, "html.parser")

    def mentions_lab(text: str) -> bool:
        lowered = text.lower()
        return any(kw in lowered for kw in LAB_KEYWORDS)

    # 1) Page title, when it looks lab-related.
    title_tag = soup.title
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        if title_text and mentions_lab(title_text):
            return title_text

    # 2) Headings containing a keyword, h1 before h2 before h3.
    for level in ("h1", "h2", "h3"):
        for heading in soup.find_all(level):
            heading_text = heading.get_text(strip=True)
            if heading_text and mentions_lab(heading_text):
                return heading_text

    # 3) Fallback: first non-empty h1/h2 regardless of keywords.
    for level in ("h1", "h2"):
        heading = soup.find(level)
        if heading:
            fallback_text = heading.get_text(strip=True)
            if fallback_text:
                return fallback_text

    return None
# berkeley.edu addresses, including real subdomains (e.g. user@eecs.berkeley.edu).
# The domain part requires "berkeley.edu" as the registrable domain preceded only
# by complete dot-terminated labels, so look-alikes such as "fakeberkeley.edu"
# no longer match (the old pattern's bare "[A-Z0-9.-]*berkeley\.edu" did).
EMAIL_REGEX = re.compile(
    r"[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)*berkeley\.edu", re.I
)


def extract_emails_from_text(text: str) -> List[str]:
    """Return all unique berkeley.edu email addresses found in *text*.

    Addresses are lowercased and returned sorted. A None or empty *text*
    yields an empty list.
    """
    found = {m.group(0).lower() for m in EMAIL_REGEX.finditer(text or "")}
    return sorted(found)
# Two capitalized words separated by whitespace, e.g. "Jane Doe".
NAME_LIKE = re.compile(r"\b([A-Z][a-z]+\s+[A-Z][a-z]+)\b")


def looks_like_person_name(s: str) -> bool:
    """Heuristic: True when *s* (5-60 chars) contains a "First Last" name."""
    if not s:
        return False
    if not (5 <= len(s) <= 60):
        return False
    return NAME_LIKE.search(s) is not None
def extract_additional_names(html: str, limit: int = 10) -> List[str]:
    """Collect up to *limit* person-like names from people/team sections of *html*.

    Scans section/div/ul/ol nodes whose text mentions a people-ish keyword
    and keeps anchor texts that pass looks_like_person_name(); if fewer than
    3 candidates are found that way, falls back to scanning every anchor on
    the page. The result is de-duplicated preserving first-seen order and
    capped at *limit*.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidates: List[str] = []
    # Look for sections likely to list people
    section_selectors = [
        "section", "div", "ul", "ol",
    ]
    section_keywords = ["people", "members", "team", "students"]
    for sel in section_selectors:
        for node in soup.find_all(sel):
            # Keyword check runs over the node's entire flattened text.
            text = (node.get_text(" ", strip=True) or "").lower()
            if not any(k in text for k in section_keywords):
                continue
            # Collect anchor texts that look like names
            for a in node.find_all("a"):
                t = a.get_text(strip=True) or ""
                if looks_like_person_name(t):
                    candidates.append(t)
                if len(candidates) >= limit:
                    break
            if len(candidates) >= limit:
                break
        # NOTE: the breaks above exit only the node loop, not this selector
        # loop, so `candidates` may grow slightly past *limit* (and the same
        # element can be visited again under a different selector). The final
        # dedupe pass below is what actually enforces the cap.
    # Fallback: scan all anchors
    if len(candidates) < 3:
        for a in soup.find_all("a"):
            t = a.get_text(strip=True) or ""
            if looks_like_person_name(t):
                candidates.append(t)
            if len(candidates) >= limit:
                break
    # Deduplicate preserving order
    seen = set()
    out = []
    for n in candidates:
        if n not in seen:
            seen.add(n)
            out.append(n)
        if len(out) >= limit:
            break
    return out
def auto_generate_email_from_name(name: str) -> str:
    """Guess a berkeley.edu address from a person's display name.

    Uses the "first.last@berkeley.edu" pattern, taking the first and last
    whitespace-separated words of *name*. A single-word name yields
    "word@berkeley.edu" (the old code duplicated it as "word.word@...").
    An empty or whitespace-only name yields "".

    Note: this is a guess, not a verified address.
    """
    # str.split() with no args never yields empty strings, so no filtering
    # is needed; it also collapses runs of whitespace.
    parts = name.strip().split()
    if not parts:
        return ""
    first = parts[0].lower()
    last = parts[-1].lower()
    # Fix: with only one word, first == last; don't emit "cher.cher@...".
    if len(parts) == 1:
        return f"{first}@berkeley.edu"
    return f"{first}.{last}@berkeley.edu"