Skip to content

Commit 6285196

Browse files
Merge pull request #136 from unicef/feature/name-parser
Add ML name parser
2 parents a18e8c9 + ed44c0e commit 6285196

File tree

13 files changed

+2810
-865
lines changed

13 files changed

+2810
-865
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,5 @@ black.txt
3333
flake8
3434
act.*
3535
root/
36+
/src/country_workspace/data/name_parser/input/
37+
/src/country_workspace/data/name_parser/tmp/

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,19 @@ dependencies = [
4646
"hope-flex-fields>=0.6.2",
4747
"hope-smart-export>=0.3",
4848
"hope-smart-import>=0.3",
49+
"notebook>=7.4.3",
50+
"numpy>=2.3",
4951
"openpyxl>=3.1.5",
52+
"pandas>=2.3",
5053
"pillow>=11.2.1",
54+
"pre-commit>=4.2",
5155
"psycopg2-binary>=2.9.9",
5256
"python-redis-lock[django]>=4",
5357
"redis",
58+
"scikit-learn>=1.7",
5459
"sentry-sdk>=2.7.1",
5560
"social-auth-app-django",
61+
"torch>=2.7.1",
5662
"unicef-security>=1.5.1",
5763
]
5864
scripts.celery-monitor = "country_workspace.__monitor__:run"

src/country_workspace/contrib/name_parser/__init__.py

Whitespace-only changes.
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "initial_id",
7+
"metadata": {
8+
"collapsed": true
9+
},
10+
"outputs": [],
11+
"source": [
12+
"import pandas as pd"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"id": "a2dc0c03c80eea0b",
19+
"metadata": {},
20+
"outputs": [],
21+
"source": [
22+
"COUNTRY_CODE = \"UKR\"\n",
23+
"NAME_TYPES = [\"given_name\", \"middle_name\", \"family_name\"]"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"id": "1e6195d85d2779fb",
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"name_df = pd.read_csv(f\"../../../data/name_parser/input/{COUNTRY_CODE}.csv\", sep=\";\", usecols=NAME_TYPES)\n",
34+
"name_df.head()"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "f960c133714b1dad",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"name_df.shape"
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": null,
50+
"id": "9bbd412ff9cb84b8",
51+
"metadata": {},
52+
"outputs": [],
53+
"source": [
54+
"name_dfs = []\n",
55+
"# Keep the wide name_df intact: each pass slices one column into its own frame.\n",
"for name in NAME_TYPES:\n",
56+
"    typed_df = name_df[name].rename(\"name\").to_frame()\n",
57+
"    typed_df[\"type\"] = name\n",
58+
"    name_dfs.append(typed_df)"
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": null,
64+
"id": "7dc5ff11aad0049",
65+
"metadata": {},
66+
"outputs": [],
67+
"source": [
68+
"name_df = pd.concat(name_dfs)\n",
69+
"name_df.shape"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"id": "6e49f394945497a4",
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"name_df[\"name\"] = name_df[\"name\"].str.replace(r\"[^-\\w' ]\", \"\", regex=True)\n",
80+
"name_df.head()"
81+
]
82+
},
83+
{
84+
"cell_type": "code",
85+
"execution_count": null,
86+
"id": "4296d2039d6ed809",
87+
"metadata": {},
88+
"outputs": [],
89+
"source": [
90+
"name_df[\"name\"] = name_df[\"name\"].str.strip().str.title()\n",
91+
"name_df = name_df.dropna()\n",
92+
"name_df = name_df.drop_duplicates()\n",
93+
"name_df.shape"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"id": "ff915f8d1f7d5409",
100+
"metadata": {},
101+
"outputs": [],
102+
"source": [
103+
"name_df"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": null,
109+
"id": "afda3a63d02324b",
110+
"metadata": {},
111+
"outputs": [],
112+
"source": [
113+
"name_df.to_csv(f\"../../../data/name_parser/tmp/{COUNTRY_CODE}.csv\", index=False)"
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": null,
119+
"id": "ffbaef8d4acd17cd",
120+
"metadata": {},
121+
"outputs": [],
122+
"source": []
123+
}
124+
],
125+
"metadata": {
126+
"kernelspec": {
127+
"display_name": "Python 3",
128+
"language": "python",
129+
"name": "python3"
130+
},
131+
"language_info": {
132+
"codemirror_mode": {
133+
"name": "ipython",
134+
"version": 2
135+
},
136+
"file_extension": ".py",
137+
"mimetype": "text/x-python",
138+
"name": "python",
139+
"nbconvert_exporter": "python",
140+
"pygments_lexer": "ipython2",
141+
"version": "2.7.6"
142+
}
143+
},
144+
"nbformat": 4,
145+
"nbformat_minor": 5
146+
}

0 commit comments

Comments
 (0)