lotus_leap/extract-all4.py at main · hc-sc-ocdo-bdpd/lotus_leap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import win32com.client
import os
import re

# Alternative database path (disabled):
#NSF_PATH = "FND-CHHAD-Reference-Libraryl.nsf"
# Path of the Notes database handed to session.GetDatabase by the __main__ entry point.
NSF_PATH = "names.nsf"
LOTUS_PASSWORD = ""  # Passed to NotesSession.Initialize; fill in only if a password is needed
OUTPUT_DIR = "output_geds"  # Root output folder; one sub-folder per view is created beneath it

# Zero-based index into each view entry's ColumnValues. The value at this index
# is split on backslashes to build the nested category folders for a document.
CATEGORY_COLUMN_INDEX = 0

# Upper bound on the length of any single folder/file name produced by
# sanitize_folder_name (keeps nested paths well away from Windows path limits).
MAX_FOLDER_NAME_LENGTH = 100

def sanitize_folder_name(name, max_length=MAX_FOLDER_NAME_LENGTH):
    """Return a Windows-safe, non-empty folder (or file) name derived from *name*.

    Steps:
      * characters Windows forbids in names (``<>:"/\\|?*``) and ASCII control
        characters are replaced by ``_``;
      * runs of whitespace/underscores are collapsed to a single ``_``;
      * the result is truncated to *max_length* characters;
      * leading underscores and trailing underscores/dots are removed
        (Windows ignores trailing dots, and a bare ``.`` or ``..`` would
        otherwise be interpreted as a relative path component).

    Args:
        name: Raw name; may be ``None`` or empty.
        max_length: Maximum length of the returned name.

    Returns:
        The sanitized name, or ``"Unnamed"`` when nothing usable remains, so
        the result can always be passed to ``os.path.join`` as a real path
        component.
    """
    if not name or not name.strip():
        return "Unnamed"
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    cleaned = re.sub(r'[\s_]+', '_', cleaned)
    cleaned = cleaned[:max_length].lstrip('_').rstrip('._')
    # Names made up solely of stripped characters ("???", "..", "___") would
    # collapse to "", which os.path.join silently drops; fall back instead.
    return cleaned or "Unnamed"

def get_document_subject(doc):
    """Pick a display title for *doc*.

    The first value of the first item named "Subject" (case-insensitive) is
    preferred. When that is missing or empty, the first value of the first
    item named "Form" is used, prefixed with ``Form_``. If neither yields
    anything, ``"UnnamedDocument"`` is returned.
    """
    def values_of(field_name):
        # Values of the first item whose lower-cased name matches, else None.
        for entry in doc.Items:
            if entry.Name.lower() == field_name:
                return entry.Values
        return None

    subject_values = values_of("subject")
    title = subject_values[0] if subject_values else None

    if not title:
        form_values = values_of("form")
        title = f"Form_{form_values[0]}" if form_values else None

    if title:
        return title
    return "UnnamedDocument"

def _iter_embedded_objects(embedded_objects):
    """Return a lazy iterator over *embedded_objects*, or None if it cannot be enumerated."""
    if hasattr(embedded_objects, "Count"):
        # COM collection: indexed from 1 through Count via Item(i).
        return (embedded_objects.Item(i) for i in range(1, embedded_objects.Count + 1))
    if hasattr(embedded_objects, "__iter__"):
        return iter(embedded_objects)
    return None

def _unique_attachment_name(safe_name, used_names):
    """Return *safe_name*, or a numbered variant if it was already used in this folder."""
    stem, ext = os.path.splitext(safe_name)
    candidate = safe_name
    suffix = 1
    # Compare case-insensitively: the target file system is Windows.
    while candidate.lower() in used_names:
        candidate = f"{stem}_{suffix}{ext}"
        suffix += 1
    used_names.add(candidate.lower())
    return candidate

def _save_attachment(embedded_obj, doc_folder_path, used_names):
    """Extract one embedded object into *doc_folder_path*, reporting success or failure."""
    attachment_name = embedded_obj.Name or "UntitledAttachment"
    safe_name = _unique_attachment_name(sanitize_folder_name(attachment_name), used_names)
    attachment_path = os.path.join(doc_folder_path, safe_name)
    try:
        embedded_obj.ExtractFile(attachment_path)
        print(f"Extracted attachment '{attachment_name}' to {attachment_path}")
    except Exception as e:
        print(f"Failed to extract attachment '{attachment_name}': {e}")

def extract_document(doc, folder_path):
    """
    Export one Notes document into its own sub-folder of *folder_path*.

    The sub-folder is named "<sanitized subject>_<first 8 chars of UniversalID>".
    Inside it, every item of the document is dumped to 'document.txt' and every
    embedded object found on the items is extracted next to it.

    Args:
        doc: Notes document COM object (uses Items and UniversalID).
        folder_path: Existing directory under which the document folder is created.

    Extraction is best-effort: a failure on one item or attachment is printed
    and the remaining items are still processed.
    """
    subject = get_document_subject(doc)
    try:
        doc_id = doc.UniversalID[:8]
    except Exception:
        doc_id = "unknown"

    # Truncate only the subject so the ID suffix always survives; otherwise two
    # documents with the same long subject would end up in the same folder.
    subject_budget = max(1, MAX_FOLDER_NAME_LENGTH - len(doc_id) - 1)
    short_subject = sanitize_folder_name(str(subject), subject_budget)
    doc_folder_name = sanitize_folder_name(f"{short_subject}_{doc_id}")
    doc_folder_path = os.path.join(folder_path, doc_folder_name)
    os.makedirs(doc_folder_path, exist_ok=True)

    # Write all fields to a text file
    text_file_path = os.path.join(doc_folder_path, "document.txt")
    with open(text_file_path, "w", encoding="utf-8") as f:
        f.write(f"----- Document: {subject} ({doc_id}) -----\n")
        for item in doc.Items:
            try:
                f.write(f"{item.Name}: {item.Values}\n")
            except Exception as e:
                f.write(f"{item.Name}: <Error reading value: {e}>\n")
        f.write("--------------------\n")

    # Names already present in the folder for this run; seeded with the field
    # dump so an attachment called "document.txt" cannot overwrite it.
    used_names = {"document.txt"}

    # Extract attachments (robust approach)
    for item in doc.Items:
        item_name = "<unknown>"
        try:
            item_name = item.Name
            # Read the property inside the try: a COM error on a single item
            # must not abort the whole document.
            embedded_objects = getattr(item, "EmbeddedObjects", None)
            if not embedded_objects:
                continue

            objects = _iter_embedded_objects(embedded_objects)
            if objects is None:
                print(f"EmbeddedObjects in item '{item_name}' is neither a COM collection nor an iterable.")
                continue

            for embedded_obj in objects:
                _save_attachment(embedded_obj, doc_folder_path, used_names)
        except Exception as e:
            print(f"Error processing embedded objects in item '{item_name}': {e}")

def _category_parts(column_values):
    """Convert a view entry's column values into a list of sanitized category folder names."""
    raw = None
    if column_values is not None and len(column_values) > CATEGORY_COLUMN_INDEX:
        raw = column_values[CATEGORY_COLUMN_INDEX]
    # If the column holds several values, str() would produce a tuple/list repr
    # as the folder name; use the first value instead.
    if isinstance(raw, (list, tuple)):
        raw = raw[0] if raw else None
    # A missing value must not turn into a folder literally called "None".
    cat_string = "" if raw is None else str(raw).strip()

    parts = []
    # Split on backslash for multi-level categories
    for piece in cat_string.split("\\"):
        piece = piece.strip()
        if not piece:
            continue
        safe = sanitize_folder_name(piece)
        # Drop components that would be ignored by os.path.join or would
        # navigate out of the view folder.
        if safe and safe not in (".", ".."):
            parts.append(safe)
    return parts or ["Uncategorized"]

def extract_all_views_with_categories(password, nsf_path, output_dir="output_all_views_categories"):
    """
    Export every document of every view in an NSF database to a folder tree.

    1) Enumerate ALL views in the NSF.
    2) For each view:
       - Create a folder named after the view.
       - For each document entry, parse column CATEGORY_COLUMN_INDEX for a
         backslash-delimited category path ("Uncategorized" when empty).
       - Extract the doc under that category path, with a subfolder named after
         the doc's subject + short UID.

    Args:
        password: Password passed to NotesSession.Initialize.
        nsf_path: Path of the database passed to NotesSession.GetDatabase.
        output_dir: Root folder of the export; created if missing.

    Raises:
        Exception: If the database cannot be opened.

    A failure while exporting a single document is reported and that document
    is skipped, so one bad document does not abort the whole export.
    """
    session = win32com.client.Dispatch("Lotus.NotesSession")
    session.Initialize(password)

    db = session.GetDatabase("", nsf_path)
    if not db.IsOpen:
        raise Exception(f"Unable to open NSF at '{nsf_path}'")

    os.makedirs(output_dir, exist_ok=True)

    views = db.Views or ()
    print(f"[INFO] Found {len(views)} views in the database.\n")

    view_count = 0
    for view in views:
        view_name = view.Name
        # Optional: skip hidden/system views
        # if view_name.startswith("(") or view_name.startswith("$"):
        #     continue

        safe_view_name = sanitize_folder_name(view_name)
        view_folder = os.path.join(output_dir, safe_view_name)
        os.makedirs(view_folder, exist_ok=True)

        print(f"[INFO] Processing view '{view_name}' -> folder '{safe_view_name}'")

        all_entries = view.AllEntries
        entry = all_entries.GetFirstEntry()
        doc_count = 0
        failed_count = 0

        while entry:
            # Fetch the successor first so the walk continues whatever happens
            # while handling the current entry.
            current, entry = entry, all_entries.GetNextEntry(entry)
            if not current.IsDocument:
                continue
            doc = current.Document
            if not doc:
                continue

            try:
                # Build final folder path: e.g. output/viewName/CatA/SubCatB
                parts = _category_parts(current.ColumnValues)
                final_folder_path = os.path.join(view_folder, *parts)
                os.makedirs(final_folder_path, exist_ok=True)

                # Extract the doc
                extract_document(doc, final_folder_path)
                doc_count += 1
            except Exception as e:
                failed_count += 1
                print(f"[WARN] Skipped a document in view '{view_name}': {e}")

        print(f"[INFO] Extracted {doc_count} documents from view '{view_name}'\n")
        if failed_count:
            print(f"[WARN] {failed_count} documents could not be extracted from view '{view_name}'\n")
        view_count += 1

    print(f"[DONE] Processed {view_count} views total.")

# Script entry point: export the configured database using the module-level settings.
if __name__ == "__main__":
    extract_all_views_with_categories(
        password=LOTUS_PASSWORD,
        nsf_path=NSF_PATH,
        output_dir=OUTPUT_DIR,
    )