-
Notifications
You must be signed in to change notification settings - Fork 32
Expand file tree
/
Copy pathword_in_path_apply.py
More file actions
57 lines (48 loc) · 2.02 KB
/
word_in_path_apply.py
File metadata and controls
57 lines (48 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import binascii
import csv
import dataclasses
import hashlib
import json
import os
import pathlib
from pathlib import Path
from download_data import TMP_DIR, get_file_scope
from meta_row import MetaRow, _meta_from_file
def migrate_repo(repo_id, new_repo_id):
    """Rebuild the FilePath column of ``meta/{new_repo_id}.csv`` in place.

    Every regular, non-symlink file under ``{TMP_DIR}/{repo_id}`` is indexed
    by the first 8 hex digits of the SHA-256 of its repo-relative path
    (forward slashes). Each meta row is matched through
    ``(RepoName, FileID)`` and its ``FilePath`` is set to
    ``data/{new_repo_id}{file_scope}{FileID}{extension}``, where the scope
    and extension are derived from the lowercased relative path. The rows
    are then written back over the same CSV file.

    Args:
        repo_id: Name of the repo directory under ``TMP_DIR``.
        new_repo_id: Identifier used for the meta file name, the lookup key
            and the new ``FilePath`` prefix.

    Raises:
        KeyError: If a meta row refers to a file that was not found under
            the repo directory.
    """
    print(repo_id, new_repo_id)

    # Scan and relativize against the same root so the two can never drift
    # apart if TMP_DIR is changed.
    repo_root = Path(TMP_DIR) / repo_id
    repo_files = {}
    for item in repo_root.glob("**/*"):
        if not item.is_file() or item.is_symlink():
            continue
        short_path = os.path.relpath(str(item), str(repo_root)).replace('\\', '/')
        file_id = hashlib.sha256(short_path.encode()).hexdigest()[:8]
        repo_files[(new_repo_id, file_id)] = short_path

    meta_file = Path(f"meta/{new_repo_id}.csv")
    new_meta = []
    for row in _meta_from_file(meta_file):
        meta_row = MetaRow(row)
        key = (meta_row.RepoName, meta_row.FileID)
        try:
            short_path = repo_files[key]
        except KeyError:
            raise KeyError(
                f"{key} from {meta_file} has no matching file under {repo_root}"
            ) from None
        # Lowercasing the whole path also lowercases the extension.
        file_path_name, file_extension = os.path.splitext(short_path.lower())
        file_scope = get_file_scope(file_path_name)
        meta_row.FilePath = f"data/{new_repo_id}{file_scope}{meta_row.FileID}{file_extension}"
        # Workaround: negative offsets are written as empty cells, not '-1'.
        if 0 > meta_row.ValueStart:
            meta_row.ValueStart = None
        if 0 > meta_row.ValueEnd:
            meta_row.ValueEnd = None
        new_meta.append(meta_row)

    if not new_meta:
        # Opening the file in 'w' mode truncates it; with no rows there is no
        # header to derive, so leave the existing file untouched.
        return

    # Derive the header before opening so a failure cannot leave a truncated file.
    fieldnames = list(dataclasses.asdict(new_meta[0]))
    # NOTE(review): no explicit encoding is given, so the platform default
    # applies - confirm it matches what _meta_from_file reads.
    with open(meta_file, 'w', newline='\n') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator='\n')
        writer.writeheader()
        writer.writerows(dataclasses.asdict(meta_row) for meta_row in new_meta)
def main():
    """Migrate every repo listed in ``snapshot.json``.

    Each key of the snapshot mapping is decoded from hex to bytes; the
    CRC-32 of those bytes, rendered as 8 zero-padded lowercase hex digits,
    is passed to ``migrate_repo`` as the new repo id alongside the
    original key.
    """
    with open("snapshot.json", encoding="utf_8") as snapshot_file:
        repos = json.load(snapshot_file)

    # Only the keys are used; the mapped values are ignored here.
    for old_id, _ in repos.items():
        checksum = binascii.crc32(binascii.unhexlify(old_id))
        migrate_repo(old_id, format(checksum, "08x"))
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()