export_words.py
105 lines (92 loc) · 3.82 KB
import os
import json

from ziman.models import WordDB


async def export_words():
    export_directory = "words"
    os.makedirs(export_directory, exist_ok=True)

    file_handlers = {}
    file_indices = {}
    words_per_file = 100
    word_counts = {}  # how many words have been written so far per first letter

    # Tortoise ORM is assumed to be initialised before this coroutine is awaited.

    # Get total count of confirmed words so we can paginate over them.
    total_words = await WordDB.filter(is_confirmed=True).count()
    batch_size = 100
    for offset in range(0, total_words, batch_size):
        # Fetch words in batches
        words = (
            await WordDB.filter(is_confirmed=True)
            .prefetch_related(
                "created_by",
                "definitions__created_by",
                "definitions__sentences__created_by",
            )
            .order_by("updated_at")
            .offset(offset)
            .limit(batch_size)
        )
        for word in words:
            first_letter = word.content[0].lower()
            if first_letter not in word_counts:
                word_counts[first_letter] = 0

            # Roll over to a new file for this letter every `words_per_file` words.
            if (
                word_counts[first_letter] % words_per_file == 0
                or first_letter not in file_handlers
            ):
                if first_letter in file_handlers:
                    handler = file_handlers[first_letter]
                    handler.seek(handler.tell() - 2)  # drop the trailing ",\n"
                    handler.write("\n]")
                    handler.close()
                if first_letter not in file_indices:
                    file_indices[first_letter] = 1
                else:
                    file_indices[first_letter] += 1
                file_path = os.path.join(
                    export_directory,
                    f"words_{first_letter}_{file_indices[first_letter]}.json",
                )
                file_handlers[first_letter] = open(file_path, "w", encoding="utf-8")
                file_handlers[first_letter].write("[\n")
            fetched_word = {
                "author": {
                    "firstname": word.created_by.firstname,
                    "lastname": word.created_by.lastname,
                },
                "content": word.content,
                "definitions": [],
            }
            for db_word_definition in word.definitions:
                fetched_word["definitions"].append(
                    {
                        "content": db_word_definition.content,
                        "type": db_word_definition.type.value,
                        "author": {
                            "firstname": db_word_definition.created_by.firstname,
                            "lastname": db_word_definition.created_by.lastname,
                        },
                        "sentences": [
                            {
                                "content": sentence.content,
                                "tense": sentence.tense.value,
                                "word_form": sentence.word_form,
                                "author": {
                                    "firstname": sentence.created_by.firstname,
                                    "lastname": sentence.created_by.lastname,
                                },
                            }
                            for sentence in db_word_definition.sentences
                        ],
                    }
                )
            # Stream the word into the current file for its letter; entries are
            # comma-separated so each file forms one JSON array.
            json.dump(
                fetched_word, file_handlers[first_letter], ensure_ascii=False, indent=2
            )
            file_handlers[first_letter].write(",\n")
            file_handlers[first_letter].flush()
            word_counts[first_letter] += 1  # Increment the word count for this letter

    # Close the remaining open files, overwriting each trailing ",\n" with the
    # closing bracket of the JSON array.
    for handler in file_handlers.values():
        handler.seek(handler.tell() - 2)
        handler.write("\n]")
        handler.close()

    print(
        f"Exported {total_words} words to separate files in the '{export_directory}' directory."
    )
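
The coroutine assumes Tortoise ORM has already been initialised for the ziman models before export_words is awaited. A minimal runner sketch is shown below; it is not part of this file, and the sqlite db_url is a placeholder assumption, while "ziman.models" matches the module the script imports WordDB from.

# run_export.py -- hypothetical runner, shown for illustration only.
from tortoise import Tortoise, run_async

from export_words import export_words


async def main():
    await Tortoise.init(
        db_url="sqlite://db.sqlite3",  # assumption: point this at the real ziman database
        modules={"models": ["ziman.models"]},
    )
    await export_words()


if __name__ == "__main__":
    # run_async also takes care of closing the database connections on exit.
    run_async(main())

With this in place, each exported file such as words_a_1.json holds a JSON array of up to 100 word objects, each carrying the author, content and definitions fields built above.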