Linkedin_crawler/enrich_jd.py at main · Soli22de/Linkedin_crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Offline post-processor: regex-extract academic / duration / salary from
the jd_raw text of every row that has a substantive JD body.

Idempotent: only fills empty fields. Safe to re-run.

Usage:
  python enrich_jd.py
"""

import re
import sys

import pandas as pd

import config
from schema import SCHEMA


# ---------- regex banks ---------------------------------------------------

# Academic level patterns. We capture canonical buckets, not raw text.
#
# Order matters: detect_academic() returns the bucket of the FIRST entry whose
# regex matches anywhere in the text, so each "... and above" bucket is listed
# before its plain counterpart.
#
# All patterns are case-insensitive, so the short abbreviations need guards:
#   * (?<![A-Za-z]) before MA / BA stops them from matching the tail of
#     ordinary words ("pharma", "Alibaba", "MBA").
#   * (?<!under) stops "graduate student" from matching inside
#     "undergraduate student", which belongs to the bachelor bucket.
ACADEMIC_PATTERNS = [
    # (canonical, regex)
    ("不限", re.compile(r"学历不限|不限学历|education unrestricted", re.I)),
    ("博士", re.compile(r"博士|PhD|Ph\.D|Doctorate", re.I)),
    ("硕士及以上", re.compile(r"硕士及以上|研究生及以上|Master\s*[(（]?\s*or above|Master.*above", re.I)),
    ("硕士", re.compile(r"硕士|研究生|Master[''']?s?|MSc|(?<![A-Za-z])MA(?!\w)|(?<!under)graduate student", re.I)),
    ("本科及以上", re.compile(r"本科及以上|学士及以上|Bachelor.*above|undergrad.*or.*above", re.I)),
    ("本科", re.compile(r"本科|学士|Bachelor[''']?s?|BSc|(?<![A-Za-z])BA(?!\w)|undergrad", re.I)),
    ("大专", re.compile(r"大专|大学专科|Associate\s+degree", re.I)),
    ("在校生", re.compile(r"在校生|在读|currently enrolled|current student", re.I)),
]

# Duration patterns, tried in list order. Every regex is case-insensitive and
# wraps the whole phrase in group 1 so the matched text keeps its unit.
DURATION_PATTERNS = [
    re.compile(source, re.I)
    for source in (
        # numeric range of months
        r"(\d+\s*(?:to|-|—|至|~)\s*\d+\s*(?:个月|months?))",
        # explicit minimum number of months
        r"((?:至少|at least|minimum)\s*\d+\s*(?:个月|months?))",
        # bare month count, optionally "or more"
        r"(\d+\s*(?:个月|months?)\s*(?:及以上|以上|or more|or above)?)",
        # days per week
        r"(\d+\s*(?:days?|天)\s*(?:/|每)?\s*(?:周|week|day))",
        # weekly attendance phrased with a leading qualifier
        r"((?:至少|every|每周|至少每周)\s*\d+\s*(?:天|days?)\s*(?:/|每)?\s*(?:周|week)?)",
        # keyword-only fallbacks
        r"(暑期|寒假|长期|6\s*months?\s*onsite|实习\s*\d+\s*个?月?)",
    )
]

# Salary patterns, tried in list order. These are case-sensitive; each regex
# wraps the whole phrase in group 1.
SALARY_PATTERNS = [
    re.compile(source)
    for source in (
        # range per day, currency marker optional
        r"(\d+\s*[-—~]\s*\d+\s*(?:元|RMB|¥|￥)?\s*/\s*天)",
        # range with a trailing currency marker, optionally per month
        r"(\d+\s*[-—~]\s*\d+\s*(?:元|RMB|¥|￥)\s*(?:/\s*月)?)",
        # leading currency marker, single value or range, optionally per day
        r"((?:¥|￥|RMB)\s*\d+\s*[-—~]?\s*\d*\s*(?:/\s*天)?)",
        # dollar amount or range, optionally per hour
        r"(\$\s*\d+(?:[.,]\d+)?\s*[-—~]?\s*\d*(?:[.,]\d+)?\s*(?:/\s*(?:hr|hour))?)",
        # range in thousands, optionally per month
        r"(\d+\s*[-—~]\s*\d+\s*[Kk]\s*(?:/月)?)",
        # USD amount per unit
        r"(\d+\s*USD\s*/\s*\w+)",
    )
]


def first_match(patterns, text):
    """Search *text* with each compiled pattern in order and return the first hit.

    The returned string is group 1 of the winning match when the pattern has
    capture groups, otherwise the whole match; surrounding whitespace is
    stripped. Returns '' when no pattern matches.
    """
    attempts = (pattern.search(text) for pattern in patterns)
    found = next((hit for hit in attempts if hit), None)
    if found is None:
        return ""
    # lastindex is None for patterns without capture groups.
    captured = found.group(1) if found.lastindex else found.group(0)
    return captured.strip()


def detect_academic(text):
    """Return the canonical label of the first ACADEMIC_PATTERNS entry whose
    regex matches anywhere in *text*, or '' when none match."""
    matching_labels = (
        label for label, regex in ACADEMIC_PATTERNS if regex.search(text)
    )
    return next(matching_labels, "")


def detect_duration(text):
    """Return the first DURATION_PATTERNS hit in *text*, or '' if none.

    Whitespace runs are collapsed to a single space and the result is capped
    at 60 characters.
    """
    candidate = first_match(DURATION_PATTERNS, text)
    normalized = re.sub(r"\s+", " ", candidate)
    return normalized[:60]


def detect_salary(text):
    """Return the first SALARY_PATTERNS hit in *text*, or '' if none.

    Whitespace runs are collapsed to a single space and the result is capped
    at 60 characters.
    """
    candidate = first_match(SALARY_PATTERNS, text)
    normalized = re.sub(r"\s+", " ", candidate)
    return normalized[:60]


def main() -> int:
    """Fill empty academic / duration / salary cells of the raw CSV from jd_raw.

    Reads ``config.RAW_CSV``, runs the regex detectors on every row whose
    ``jd_raw`` text is at least 50 characters long, and only writes a field
    when it is currently empty. The result is written back to the same path
    (the input file is overwritten in place) with the columns given by
    ``SCHEMA``, and a before/after fill summary is printed.

    Returns:
        Process exit code: 1 when the CSV does not exist, otherwise 0.
    """
    # One threshold drives both the enrichment gate and the statistics.
    # Rows used to be enriched from 50 chars up while only rows over 100 chars
    # were counted, so the "% of JD-having rows" denominator left out rows
    # that had in fact been processed.
    min_jd_chars = 50
    targets = ["academic", "duration", "salary"]

    if not config.RAW_CSV.exists():
        print(f"[FATAL] No CSV at {config.RAW_CSV}")
        return 1

    # dtype=str + fillna("") guarantees every cell is a (possibly empty) string.
    df = pd.read_csv(config.RAW_CSV, encoding="utf-8-sig", dtype=str).fillna("")
    print(f"[INFO] CSV: {len(df)} rows")

    before = {col: (df[col].astype(str).str.len() > 0).sum() for col in targets}

    n_jd = (df["jd_raw"].astype(str).str.len() >= min_jd_chars).sum()
    print(f"[INFO] {n_jd} rows have substantive JD (>={min_jd_chars} chars)")

    rows = df.to_dict(orient="records")
    for row in rows:
        jd = str(row.get("jd_raw", ""))
        if len(jd) < min_jd_chars:
            continue
        # Only fill blanks so that re-running never overwrites existing values.
        if not row.get("academic"):
            row["academic"] = detect_academic(jd)
        if not row.get("duration"):
            row["duration"] = detect_duration(jd)
        if not row.get("salary"):
            row["salary"] = detect_salary(jd)

    # A SCHEMA column that is absent from the CSV comes back as NaN. Blank it
    # so the "after" count below does not treat the string "nan" as a filled
    # cell; to_csv already writes NaN as an empty field, so the file on disk
    # is unaffected.
    out_df = pd.DataFrame(rows, columns=SCHEMA).fillna("")
    out_df.to_csv(config.RAW_CSV, index=False, encoding="utf-8-sig")

    after = {col: (out_df[col].astype(str).str.len() > 0).sum() for col in targets}

    print()
    print(f"  Field      before -> after  ({n_jd} rows have JD)")
    for col in targets:
        delta = after[col] - before[col]
        # max(..., 1) avoids division by zero when no row has a usable JD.
        pct_of_jd = 100 * delta / max(n_jd, 1)
        print(f"  {col:9s} {before[col]:>4} -> {after[col]:>4}  (+{delta}, {pct_of_jd:.0f}% of JD-having rows)")

    print(f"\n[DONE] {config.RAW_CSV}")
    return 0


# Script entry point: main()'s integer return value becomes the process exit
# status.
if __name__ == "__main__":
    sys.exit(main())