-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvcf_parser.py
More file actions
151 lines (131 loc) · 5.54 KB
/
vcf_parser.py
File metadata and controls
151 lines (131 loc) · 5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
"""
VCARD 3.0 Parser CLI
Usage:
python vcf_parser.py input1.vcf [input2.vcf ...] --output-dir ./csv_output
python vcf_parser.py contacts1.vcf contacts2.vcf --output-dir ./csv_out
"""
import re
import os
import argparse
import pandas as pd
from collections import defaultdict
def parse_vcards(vcard_text):
    """Parse vCard 3.0 text into a pandas DataFrame, one row per card.

    Extracts simple fields (N, FN, ORG, TITLE, BDAY), type-qualified fields
    (TEL, EMAIL, ADR, URL, IMPP — split into per-type columns such as
    ``TEL-CELL``), and Apple address-book extensions (``X-ABRELATEDNAMES`` /
    ``X-ABDATE`` with their ``X-ABLabel`` labels). Multiple values for the
    same column are joined with ``" | "``.

    Args:
        vcard_text: Raw text containing zero or more BEGIN:VCARD/END:VCARD blocks.

    Returns:
        pandas.DataFrame with one row per vCard; missing fields are "".
    """
    vcards = re.findall(r"BEGIN:VCARD(.*?)END:VCARD", vcard_text, re.DOTALL)
    standard_fields = ["N", "FN", "ORG", "TITLE", "BDAY"]
    type_sensitive_fields = ["TEL", "EMAIL", "ADR", "URL", "IMPP"]
    all_fields = standard_fields + ["RelatedNames", "CustomDates"]
    # Value-capture patterns for fields that may carry ;TYPE=... parameters.
    type_field_patterns = {
        field: re.compile(rf'{field}(?:;[^:]+)*:(.+)', re.IGNORECASE)
        for field in type_sensitive_fields
    }
    type_extract_pattern = re.compile(r';type=([^:;]+)', re.IGNORECASE)
    # Apple groups related properties as "itemN.<PROP>"; the index N ties an
    # X-ABLabel to its X-ABRELATEDNAMES / X-ABDATE value.
    apple_related_pattern = re.compile(r"item(\d+)\.X-ABRELATEDNAMES(?::|;type=[^:]*:)(.+)")
    apple_label_pattern = re.compile(r"item(\d+)\.X-ABLabel(?::|;type=[^:]*:)(.+)")
    apple_date_pattern = re.compile(r"item(\d+)\.X-ABDATE(?::|;type=[^:]*:)(.+)")
    parsed_data = []
    all_type_columns = set()
    for block in vcards:
        entry = defaultdict(list)
        related_map = {}
        label_map = {}
        date_map = {}
        field_type_map = defaultdict(list)
        for line in block.strip().splitlines():
            # Apple-related names: stash value keyed by item index.
            if "X-ABRELATEDNAMES" in line:
                match = apple_related_pattern.match(line)
                if match:
                    idx, value = match.groups()
                    related_map[idx] = value.strip()
                continue
            if "X-ABLabel" in line:
                match = apple_label_pattern.match(line)
                if match:
                    idx, label = match.groups()
                    label_map[idx] = label.strip()
                continue
            if "X-ABDATE" in line:
                match = apple_date_pattern.match(line)
                if match:
                    idx, date = match.groups()
                    date_map[idx] = date.strip()
                continue
            for field in type_sensitive_fields:
                # Require ':' or ';' right after the name so e.g. "TEL" cannot
                # match an unrelated property that merely starts with "TEL".
                if line.startswith((field + ":", field + ";")):
                    match = type_field_patterns[field].search(line)
                    if match:
                        value = match.group(1).strip()
                        # Skip Outlook-internal deep links, not real URLs.
                        if field == "URL" and value.lower().startswith("ms-outlook://"):
                            continue
                        types = type_extract_pattern.findall(line)
                        if not types:
                            types = ["GENERIC"]
                        for t in types:
                            col = f"{field.upper()}-{t.upper()}"
                            field_type_map[col].append(value)
                            all_type_columns.add(col)
                    break
            else:
                for field in standard_fields:
                    # Same delimiter check: previously bare startswith(field)
                    # let "N" swallow NOTE:/NICKNAME: lines into the name column.
                    if line.startswith((field + ":", field + ";")):
                        try:
                            _, value = line.split(":", 1)
                            entry[field].append(value.strip())
                        except ValueError:
                            pass
                        break
        # Apple: RelatedNames — pair each value with its (cleaned) label.
        relationships = []
        for idx, name in related_map.items():
            label = label_map.get(idx, "Related")
            # Strip Apple's _$!<...>!$_ wrapper around builtin labels.
            label_clean = re.sub(r"_\$!<(.*?)>!\$_", r"\1", label)
            relationships.append(f"{label_clean}: {name}")
        if relationships:
            entry["RelatedNames"] = [" | ".join(relationships)]
        # Apple: CustomDates — same pairing for X-ABDATE entries.
        dates = []
        for idx, date in date_map.items():
            label = label_map.get(idx, "CustomDate")
            label_clean = re.sub(r"_\$!<(.*?)>!\$_", r"\1", label)
            dates.append(f"{label_clean}: {date}")
        if dates:
            entry["CustomDates"] = [" | ".join(dates)]
        # Collapse runs of empty N components (";;;" -> ";").
        if "N" in entry:
            entry["N"] = [re.sub(r';{2,}', ';', n) for n in entry["N"]]
        # Collapse doubled backslashes in addresses.
        for col in list(field_type_map.keys()):
            if col.startswith("ADR-"):
                field_type_map[col] = [re.sub(r'\\{2,}', r'\\', a) for a in field_type_map[col]]
        # Merge typed columns into the entry.
        for col, values in field_type_map.items():
            entry[col] = [" | ".join(values)]
        parsed_data.append(entry)
    rows = []
    # Sort the dynamically discovered type columns so CSV column order is
    # deterministic across runs (a set's iteration order is not).
    for entry in parsed_data:
        flat = {}
        for field in all_fields:
            flat[field] = " | ".join(entry.get(field, []))
        for col in sorted(all_type_columns):
            flat[col] = " | ".join(entry.get(col, []))
        rows.append(flat)
    return pd.DataFrame(rows)
def _convert_file(source, out_dir):
    """Convert one .vcf file to CSV inside *out_dir*; return the CSV path."""
    with open(source, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    frame = parse_vcards(contents)
    stem = os.path.splitext(os.path.basename(source))[0]
    destination = os.path.join(out_dir, f"{stem}.csv")
    frame.to_csv(destination, index=False)
    return destination


def main():
    """CLI entry point: parse each input .vcf and emit one CSV per file.

    Failures on individual files are reported and skipped so the remaining
    inputs are still processed.
    """
    cli = argparse.ArgumentParser(description="VCARD 3.0 to CSV Parser")
    cli.add_argument("input", nargs="+", help="Path(s) to .vcf file(s)")
    cli.add_argument("--output-dir", "-o", required=True, help="Directory to save CSV files")
    options = cli.parse_args()
    os.makedirs(options.output_dir, exist_ok=True)
    for source in options.input:
        try:
            destination = _convert_file(source, options.output_dir)
            print(f"✅ Parsed: {source} → {destination}")
        except Exception as error:
            # Best-effort CLI: report the failure and move on to the next file.
            print(f"❌ Failed to parse {source}: {error}")


if __name__ == "__main__":
    main()