-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenrich_jd.py
More file actions
126 lines (96 loc) · 4.54 KB
/
enrich_jd.py
File metadata and controls
126 lines (96 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Offline post-processor: regex-extract academic / duration / salary from
the jd_raw text of every row that has a substantive JD body.
Idempotent: only fills empty fields. Safe to re-run.
Usage:
python enrich_jd.py
"""
import re
import sys
import pandas as pd
import config
from schema import SCHEMA
# ---------- regex banks ---------------------------------------------------
# Academic level patterns. We capture canonical buckets, not raw text.
#
# Entries are tried top to bottom and the first bucket whose regex matches
# wins, so the "... and above" variants must stay ahead of their plain
# counterparts.


def _ascii_word(token):
    """Wrap regex *token* so it only matches as a standalone ASCII word.

    ``\\b`` / ``\\w`` are deliberately avoided: CJK characters count as word
    characters, so ``\\bMA\\b`` would miss "MA学位". Only ASCII letters, digits
    and underscore are treated as word characters here.
    """
    return rf"(?<![A-Za-z0-9_]){token}(?![A-Za-z0-9_])"


# "Master", "Masters", "Master's" (straight or curly apostrophe) as a whole
# word -- not "Mastery", "Mastering" or "Webmaster". Non-ASCII characters are
# written as \uXXXX escapes so they survive text normalisation of this file.
_MASTER = _ascii_word(r"Master(?:['\u2019\u2018]?s)?")
_BACHELOR = r"Bachelor['\u2019\u2018]?s?"

ACADEMIC_PATTERNS = [
    # (canonical, regex)
    ("不限", re.compile(r"学历不限|不限学历|education unrestricted", re.I)),
    ("博士", re.compile(r"博士|PhD|Ph\.D|Doctorate", re.I)),
    (
        "硕士及以上",
        re.compile(
            r"硕士及以上|研究生及以上"
            # \uff08 is the fullwidth "(" common in Chinese text.
            r"|(?<![A-Za-z0-9_])Master\s*[(\uff08]?\s*or above"
            "|" + _MASTER + r".*above",
            re.I,
        ),
    ),
    (
        "硕士",
        re.compile(
            r"硕士|研究生"
            "|" + _MASTER
            + "|" + _ascii_word("MSc")
            # Guarded so "diploma" / "sigma" no longer count as an MA.
            + "|" + _ascii_word("MA")
            # Guarded so "undergraduate student" is not read as graduate.
            + r"|(?<![A-Za-z])(?:post)?graduate student",
            re.I,
        ),
    ),
    (
        "本科及以上",
        re.compile(
            r"本科及以上|学士及以上|Bachelor.*above|undergrad.*or.*above", re.I
        ),
    ),
    (
        "本科",
        re.compile(
            r"本科|学士"
            "|" + _BACHELOR
            + "|" + _ascii_word("BSc")
            # Guarded so "Alibaba" / "MBA" no longer count as a BA.
            + "|" + _ascii_word("BA")
            + r"|undergrad",
            re.I,
        ),
    ),
    ("大专", re.compile(r"大专|大学专科|Associate\s+degree", re.I)),
    ("在校生", re.compile(r"在校生|在读|currently enrolled|current student", re.I)),
]
# Duration patterns, all case-insensitive. Each one captures the matched text
# (number plus unit) in group 1; earlier entries take priority over later ones.
DURATION_PATTERNS = [
    re.compile(source, re.I)
    for source in (
        # Month range: "3-6个月", "3 to 6 months"
        r"(\d+\s*(?:to|-|—|至|~)\s*\d+\s*(?:个月|months?))",
        # Lower bound in months: "至少3个月", "at least 3 months"
        r"((?:至少|at least|minimum)\s*\d+\s*(?:个月|months?))",
        # Plain month count, optionally "or more": "6个月以上"
        r"(\d+\s*(?:个月|months?)\s*(?:及以上|以上|or more|or above)?)",
        # Days per week: "4天/周", "3 days/week"
        r"(\d+\s*(?:days?|天)\s*(?:/|每)?\s*(?:周|week|day))",
        # Prefixed days per week: "每周4天", "至少3天"
        r"((?:至少|every|每周|至少每周)\s*\d+\s*(?:天|days?)\s*(?:/|每)?\s*(?:周|week)?)",
        # Keyword fallbacks: season / long-term / "实习3个月"
        r"(暑期|寒假|长期|6\s*months?\s*onsite|实习\s*\d+\s*个?月?)",
    )
]
# Salary patterns. CN intern is usually X-Y/day; some are USD or hourly.
#
# Each pattern captures the full salary expression in group 1; earlier entries
# take priority. Non-ASCII punctuation is written as \uXXXX escapes so that the
# halfwidth and fullwidth variants stay distinct even if this file is
# normalised: \u00a5 / \uffe5 are the half/fullwidth yen signs, \u2013 / \u2014
# the en/em dashes and \uff5e the fullwidth tilde.
_SALARY_SEP = r"[-\u2013\u2014~\uff5e]"
_SALARY_YEN = r"[\u00a5\uffe5]"

SALARY_PATTERNS = [
    # Daily range, currency optional: "200-300元/天", "200-300/天"
    re.compile(
        rf"(\d+\s*{_SALARY_SEP}\s*\d+\s*(?:元|RMB|{_SALARY_YEN})?\s*/\s*天)", re.I
    ),
    # Range with explicit currency, optionally per month: "3000-5000元/月"
    re.compile(
        rf"(\d+\s*{_SALARY_SEP}\s*\d+\s*(?:元|RMB|{_SALARY_YEN})\s*(?:/\s*月)?)", re.I
    ),
    # Currency-prefixed amount or range: "¥200/天", "RMB 200-300"
    re.compile(
        rf"((?:{_SALARY_YEN}|RMB)\s*\d+\s*{_SALARY_SEP}?\s*\d*\s*(?:/\s*天)?)", re.I
    ),
    # Dollar amount or range, optionally hourly: "$25/hr", "$20-30/hour"
    re.compile(
        rf"(\$\s*\d+(?:[.,]\d+)?\s*{_SALARY_SEP}?\s*\d*(?:[.,]\d+)?"
        r"\s*(?:/\s*(?:hr|hour))?)",
        re.I,
    ),
    # Thousands range, optionally per month: "15-20K/月"
    re.compile(rf"(\d+\s*{_SALARY_SEP}\s*\d+\s*[Kk]\s*(?:/\s*月)?)", re.I),
    # USD per unit: "3000 USD/month"
    re.compile(r"(\d+\s*USD\s*/\s*\w+)", re.I),
]
def first_match(patterns, text):
    """Search *text* with each compiled pattern in order and return the first hit.

    The result is the stripped text of capture group 1 when the match reports
    a capture group, otherwise the stripped whole match. Returns '' when no
    pattern matches.
    """
    for regex in patterns:
        hit = regex.search(text)
        if hit is None:
            continue
        # lastindex is None when no capture group took part in the match.
        group_no = 1 if hit.lastindex else 0
        return hit.group(group_no).strip()
    return ""
def detect_academic(text):
    """Return the canonical label of the first ACADEMIC_PATTERNS entry matching *text*.

    Entries are checked in list order; '' is returned when none matches.
    """
    return next(
        (label for label, regex in ACADEMIC_PATTERNS if regex.search(text)),
        "",
    )
def detect_duration(text):
    """Return the first DURATION_PATTERNS hit in *text*, or '' if there is none.

    Runs of whitespace are collapsed to a single space and the result is
    capped at 60 characters.
    """
    hit = first_match(DURATION_PATTERNS, text)
    collapsed = re.sub(r"\s+", " ", hit)
    return collapsed[:60]
def detect_salary(text):
    """Return the first SALARY_PATTERNS hit in *text*, or '' if there is none.

    Runs of whitespace are collapsed to a single space and the result is
    capped at 60 characters.
    """
    hit = first_match(SALARY_PATTERNS, text)
    collapsed = re.sub(r"\s+", " ", hit)
    return collapsed[:60]
def main() -> int:
    """Fill empty academic / duration / salary cells of the raw CSV from jd_raw.

    Reads ``config.RAW_CSV``, runs the regex detectors over the ``jd_raw`` text
    of every row whose JD is long enough, writes the result back to the same
    path and prints a before/after summary. Cells that already hold a value
    are never overwritten.

    Returns:
        Process exit code: 0 on success, 1 when the CSV does not exist.
    """
    # One threshold drives BOTH the enrichment loop and the reported row
    # count. Previously the count used "> 100" while the loop used ">= 50",
    # so the percentage column could exceed 100%.
    min_jd_chars = 50
    detectors = {
        "academic": detect_academic,
        "duration": detect_duration,
        "salary": detect_salary,
    }

    if not config.RAW_CSV.exists():
        print(f"[FATAL] No CSV at {config.RAW_CSV}")
        return 1

    # NOTE(review): with pandas' default NA handling, cells such as "NA",
    # "N/A" or "null" are read as missing and come back as "" here, so they
    # are rewritten as empty on save. Confirm that is acceptable for every
    # column, or pass keep_default_na=False.
    df = pd.read_csv(config.RAW_CSV, encoding="utf-8-sig", dtype=str).fillna("")
    print(f"[INFO] CSV: {len(df)} rows")

    before = {col: (df[col].astype(str).str.len() > 0).sum() for col in detectors}
    n_jd = (df["jd_raw"].astype(str).str.len() >= min_jd_chars).sum()
    print(f"[INFO] {n_jd} rows have substantive JD (>= {min_jd_chars} chars)")

    rows = df.to_dict(orient="records")
    for row in rows:
        jd = str(row.get("jd_raw", ""))
        if len(jd) < min_jd_chars:
            continue
        for col, detect in detectors.items():
            # Only fill blanks, which is what makes re-runs safe.
            if not row.get(col):
                row[col] = detect(jd)

    # Rebuild with the SCHEMA column list so the written file follows it.
    out_df = pd.DataFrame(rows, columns=SCHEMA)
    out_df.to_csv(config.RAW_CSV, index=False, encoding="utf-8-sig")

    after = {col: (out_df[col].astype(str).str.len() > 0).sum() for col in detectors}
    print()
    print(f" Field before -> after ({n_jd} rows have JD)")
    for col in detectors:
        delta = after[col] - before[col]
        # max(..., 1) avoids dividing by zero when no row has a usable JD.
        pct_of_jd = 100 * delta / max(n_jd, 1)
        print(f" {col:9s} {before[col]:>4} -> {after[col]:>4} (+{delta}, {pct_of_jd:.0f}% of JD-having rows)")
    print(f"\n[DONE] {config.RAW_CSV}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())