-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract-all4.py
More file actions
181 lines (150 loc) · 7.17 KB
/
Copy pathextract-all4.py
File metadata and controls
181 lines (150 loc) · 7.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import win32com.client
import os
import re
# ---- Export configuration (consumed by the __main__ entry point) ----
# Database path handed to session.GetDatabase("", ...); the commented-out
# line is an alternative database name kept for reference.
#NSF_PATH = "FND-CHHAD-Reference-Libraryl.nsf"
NSF_PATH = "names.nsf"
# Password passed to NotesSession.Initialize(); empty string when none is set.
LOTUS_PASSWORD = "" # If needed
# Root directory under which one folder per view is created.
OUTPUT_DIR = "output_geds"
# Which column index to parse for backslash-delimited categories?
# (0-based index into each view entry's ColumnValues.)
CATEGORY_COLUMN_INDEX = 0
# Default truncation length used by sanitize_folder_name().
MAX_FOLDER_NAME_LENGTH = 100
def sanitize_folder_name(name, max_length=MAX_FOLDER_NAME_LENGTH):
    """Return a Windows-safe folder (or file) name derived from *name*.

    Characters that Windows rejects in file names, plus ASCII control
    characters, are replaced with underscores; runs of whitespace and
    underscores collapse to a single underscore; the result is truncated to
    *max_length* characters and stripped of leading underscores and of
    trailing underscores and dots.

    Args:
        name: Raw name, e.g. a view title, category component or attachment
            file name.
        max_length: Maximum number of characters kept before trimming.

    Returns:
        The cleaned name, or ``"Unnamed"`` when *name* is empty/blank or when
        nothing usable is left after cleaning (for example ``"???"`` or
        ``".."``).
    """
    if not name or not name.strip():
        return "Unnamed"
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    cleaned = re.sub(r'[\s_]+', '_', cleaned)
    # Trailing dots are dropped because Windows discards them when creating
    # the entry, and a bare "." / ".." component would otherwise redirect
    # os.path.join() to the current or parent directory.
    cleaned = cleaned[:max_length].lstrip('_').rstrip('_.')
    # Never hand back "": joining an empty component silently resolves to the
    # parent folder instead of a new one.
    return cleaned or "Unnamed"
def get_document_subject(doc):
    """Return a display name for *doc*.

    The first item named "Subject" (case-insensitive) supplies the name. If
    it is absent or has no usable first value, the first item named "Form" is
    used instead, rendered as ``Form_<value>``. When neither yields a name,
    the literal ``"UnnamedDocument"`` is returned.

    Args:
        doc: Object exposing ``Items``, an iterable of items that each have
            ``Name`` and ``Values`` attributes.

    Returns:
        The subject value, ``"Form_<value>"``, or ``"UnnamedDocument"``.
    """
    subject = None
    form = None
    subject_seen = False
    form_seen = False
    # Single pass: doc.Items is read only once, so the item list is not
    # fetched a second time just to look for the Form fallback.
    for item in doc.Items:
        item_name = item.Name.lower()
        if item_name == "subject" and not subject_seen:
            subject_seen = True
            subject = _first_item_value(item)
        elif item_name == "form" and not form_seen:
            form_seen = True
            form = _first_item_value(item)
        # Stop as soon as the outcome can no longer change.
        if subject or (subject_seen and form_seen):
            break
    if subject:
        return subject
    if form is not None:
        return f"Form_{form}"
    return "UnnamedDocument"


def _first_item_value(item):
    """Return the first value of *item*, or None when it has no values."""
    values = item.Values  # read once; each access is a separate property fetch
    if not values:
        return None
    # NOTE(review): the Notes documentation describes Values of a rich-text
    # item as one plain string rather than an array - confirm for this COM
    # binding. Indexing a string would return only its first character.
    if isinstance(values, (str, bytes)):
        return values
    return values[0]
def extract_document(doc, folder_path):
    """Export one document into its own subfolder of *folder_path*.

    The subfolder is named after the document subject plus the first eight
    characters of its UniversalID. Every item is dumped as ``Name: Values``
    into ``document.txt``; then, for each item that exposes
    ``EmbeddedObjects``, every embedded object is written out with
    ``ExtractFile``. Attachment failures are printed and skipped rather than
    raised.

    Args:
        doc: Document object exposing ``Items`` and ``UniversalID``.
        folder_path: Existing directory that receives the document subfolder.
    """
    subject = get_document_subject(doc)
    try:
        doc_id = doc.UniversalID[:8]
    except Exception:
        doc_id = "unknown"
    doc_folder_name = sanitize_folder_name(f"{subject}_{doc_id}")
    doc_folder_path = os.path.join(folder_path, doc_folder_name)
    os.makedirs(doc_folder_path, exist_ok=True)

    # Fetch the item list once and reuse it for both passes below.
    items = doc.Items

    # Write all fields to a text file
    text_file_path = os.path.join(doc_folder_path, "document.txt")
    with open(text_file_path, "w", encoding="utf-8") as f:
        f.write(f"----- Document: {subject} ({doc_id}) -----\n")
        for item in items:
            try:
                f.write(f"{item.Name}: {item.Values}\n")
            except Exception as e:
                f.write(f"{item.Name}: <Error reading value: {e}>\n")
        f.write("--------------------\n")

    # File names already taken inside this document folder (lower-cased,
    # since Windows file names are case-insensitive). Seeded with the field
    # dump so an attachment called "document.txt" cannot overwrite it.
    used_names = {"document.txt"}

    for item in items:
        try:
            # One guarded read replaces hasattr() followed by a second
            # property access; a missing attribute simply yields None.
            embedded_objects = getattr(item, "EmbeddedObjects", None)
            if not embedded_objects:
                continue
            objects = _embedded_object_sequence(embedded_objects)
            if objects is None:
                print(f"EmbeddedObjects in item '{item.Name}' is neither a COM collection nor an iterable.")
                continue
            for embedded_obj in objects:
                _extract_attachment(embedded_obj, doc_folder_path, used_names)
        except Exception as e:
            print(f"Error processing embedded objects in item '{item.Name}': {e}")


def _embedded_object_sequence(embedded_objects):
    """Return an iterator over *embedded_objects*, or None if it is opaque."""
    # COM-style collection: 1-based Item(i) access bounded by Count.
    if hasattr(embedded_objects, "Count"):
        return (embedded_objects.Item(i) for i in range(1, embedded_objects.Count + 1))
    # Plain Python iterable (e.g. a tuple of objects).
    if hasattr(embedded_objects, "__iter__"):
        return iter(embedded_objects)
    return None


def _unique_name(name, used_names):
    """Return *name*, suffixed with _1, _2, ... if it is already in *used_names*."""
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 1
    while candidate.lower() in used_names:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used_names.add(candidate.lower())
    return candidate


def _extract_attachment(embedded_obj, doc_folder_path, used_names):
    """Write one embedded object into *doc_folder_path*, reporting the outcome."""
    attachment_name = embedded_obj.Name or "UntitledAttachment"
    # Fall back to a fixed name if sanitising leaves nothing, otherwise the
    # target path would be the folder itself.
    safe_name = sanitize_folder_name(attachment_name) or "UntitledAttachment"
    safe_name = _unique_name(safe_name, used_names)
    # Absolute path, so the file lands next to document.txt regardless of
    # which directory the extracting component resolves relative paths from.
    attachment_path = os.path.abspath(os.path.join(doc_folder_path, safe_name))
    try:
        embedded_obj.ExtractFile(attachment_path)
        print(f"Extracted attachment '{attachment_name}' to {attachment_path}")
    except Exception as e:
        print(f"Failed to extract attachment '{attachment_name}': {e}")
def extract_all_views_with_categories(password, nsf_path, output_dir="output_all_views_categories"):
    """Export every document of every view in an NSF database to folders.

    1) Enumerate ALL views in the NSF (no filtering, so views whose names
       start with "(" or "$" are exported as well).
    2) For each view:
       - Create a folder named after the view.
       - For each document entry, parse column CATEGORY_COLUMN_INDEX for a
         backslash-delimited category path.
       - Extract the doc under that category path, in a subfolder named after
         the doc's subject + short UID (see extract_document).

    Args:
        password: Password passed to ``NotesSession.Initialize``.
        nsf_path: Database path passed to ``GetDatabase("", nsf_path)``.
        output_dir: Root directory for the export; created if missing.

    Raises:
        Exception: If the database cannot be opened.
    """
    session = win32com.client.Dispatch("Lotus.NotesSession")
    session.Initialize(password)
    db = session.GetDatabase("", nsf_path)
    # Guard against a missing object as well as an unopened one, so the
    # caller gets the descriptive error instead of an AttributeError.
    if db is None or not db.IsOpen:
        raise Exception(f"Unable to open NSF at '{nsf_path}'")

    os.makedirs(output_dir, exist_ok=True)
    views = db.Views or ()
    print(f"[INFO] Found {len(views)} views in the database.\n")

    view_count = 0
    for view in views:
        view_name = view.Name
        safe_view_name = sanitize_folder_name(view_name)
        view_folder = os.path.join(output_dir, safe_view_name)
        os.makedirs(view_folder, exist_ok=True)
        print(f"[INFO] Processing view '{view_name}' -> folder '{safe_view_name}'")

        all_entries = view.AllEntries
        entry = all_entries.GetFirstEntry()
        doc_count = 0
        while entry:
            # Fetch the successor before doing any work with this entry.
            next_entry = all_entries.GetNextEntry(entry)
            if entry.IsDocument:
                doc = entry.Document
                if doc:
                    parts = _category_folder_parts(entry.ColumnValues)
                    # Build final folder path: e.g. output/viewName/CatA/SubCatB
                    final_folder_path = os.path.join(view_folder, *parts)
                    os.makedirs(final_folder_path, exist_ok=True)
                    extract_document(doc, final_folder_path)
                    doc_count += 1
            entry = next_entry

        print(f"[INFO] Extracted {doc_count} documents from view '{view_name}'\n")
        view_count += 1

    print(f"[DONE] Processed {view_count} views total.")


def _category_folder_parts(column_values):
    """Turn a view entry's column values into sanitized category folder names."""
    raw = None
    if column_values is not None and len(column_values) > CATEGORY_COLUMN_INDEX:
        raw = column_values[CATEGORY_COLUMN_INDEX]
    # NOTE(review): a multi-value column presumably arrives as a sequence;
    # the first value is used rather than the sequence's repr - confirm this
    # matches how the views are categorised.
    if isinstance(raw, (list, tuple)):
        raw = raw[0] if raw else None
    # A missing value must not become a folder literally called "None".
    cat_string = "" if raw is None else str(raw).strip()

    # Split on backslash for multi-level categories, dropping blank levels.
    pieces = (piece.strip() for piece in cat_string.split("\\"))
    parts = [sanitize_folder_name(piece) for piece in pieces if piece]
    # Discard components that would collapse or escape the view folder.
    parts = [part for part in parts if part not in ("", ".", "..")]
    return parts or ["Uncategorized"]
if __name__ == "__main__":
    # Script entry point: run the export with the module-level configuration.
    extract_all_views_with_categories(
        password=LOTUS_PASSWORD,
        nsf_path=NSF_PATH,
        output_dir=OUTPUT_DIR,
    )