1 | | -
2 | 1 | # Copyright 2023-2024 llmware |
3 | 2 |
4 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you |
@@ -65,14 +64,22 @@ def __init__(self, library): |
65 | 64 | self.pre_initialization_bow_data = {} |
66 | 65 | self.post_initialization_bow_data = {} |
67 | 66 |
68 | | - # create stop words txt file in nlp path |
69 | | - self.stop_words = Utilities().load_stop_words_list(self.library.nlp_path) |
| 67 | + # Load stop words with error handling
| 68 | + try:
| 69 | +     self.stop_words = Utilities().load_stop_words_list(self.library.nlp_path)
| 70 | + except Exception as e:
| 71 | +     logger.error(f"Failed to load stop words: {e}")
| 72 | +     self.stop_words = []
70 | 73 |
71 | | - # load graph c modules - note: if any issues loading module, will be captured in get_module_graph_functions() |
72 | | - self._mod_utility = Utilities().get_module_graph_functions() |
| 74 | + # Load graph C modules with error handling
| 75 | + try:
| 76 | +     self._mod_utility = Utilities().get_module_graph_functions()
| 77 | + except Exception as e:
| 78 | +     logger.error(f"Failed to load graph utility module: {e}")
| 79 | +     self._mod_utility = None
73 | 80 |
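Both additions above share the same fail-soft shape: try the load, log the failure, and fall back to a safe default so that `__init__` never raises. As a side note, that shape factors into a small helper; a minimal sketch, where the `load_or_default` name and standalone form are illustrative rather than llmware API:

```python
import logging

logger = logging.getLogger(__name__)

def load_or_default(loader, default, label):
    """Call 'loader', log any failure, and return 'default' so that
    construction degrades gracefully instead of raising."""
    try:
        return loader()
    except Exception as e:
        logger.error(f"Failed to load {label}: {e}")
        return default

# usage sketch, mirroring the two blocks above:
# self.stop_words = load_or_default(
#     lambda: Utilities().load_stop_words_list(self.library.nlp_path), [], "stop words")
# self._mod_utility = load_or_default(
#     lambda: Utilities().get_module_graph_functions(), None, "graph C modules")
```

Note that with `None` as the `_mod_utility` fallback, any downstream call into the C module should check for `None` first to avoid an `AttributeError`.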
74 | 81 | # new method - used to track 'counter' inside the bow files for incremental read/write/analysis |
75 | | - def bow_locator(self): |
| 82 | + def bow_locator(self) -> tuple: |
76 | 83 |
77 | 84 | """ Internal utility method used to enable scalability across multiple underlying BOW (Bag-of-Word) |
78 | 85 | files which are created by the graph module. """ |
@@ -103,16 +110,26 @@ def bow_locator(self): |
103 | 110 | f"{top_bow_file}") |
104 | 111 | bow_index = 0 |
105 | 112 |
106 | | - fp = open(os.path.join(dataset_fp, top_bow_file), "r", encoding='utf-8') |
107 | | - fp.seek(0, 2) |
108 | | - bow_byte_index = fp.tell() |
109 | | - fp.seek(0, 0) # rewind |
110 | | - bow_tokens = len(fp.read().split(",")) |
111 | | - fp.close() |
| 113 | + fp = None
| 114 | + try:
| 115 | +     fp = open(os.path.join(dataset_fp, top_bow_file), "r", encoding='utf-8')
| 116 | +     fp.seek(0, 2)
| 117 | +     bow_byte_index = fp.tell()
| 118 | +     fp.seek(0, 0)  # rewind
| 119 | +     bow_tokens = len(fp.read().split(","))
| 120 | + except FileNotFoundError:
| 121 | +     logger.error(f"BOW file not found: {top_bow_file}")
| 122 | +     return 0, 0, 0, [], True
| 123 | + except Exception as e:
| 124 | +     logger.error(f"Error reading BOW file: {e}")
| 125 | +     return 0, 0, 0, [], True
| 126 | + finally:
| 127 | +     if fp is not None:
| 128 | +         fp.close()
112 | 129 |
113 | 130 | return bow_index, bow_byte_index, bow_tokens, bow_files, no_bow |
114 | 131 |
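Because the error paths above return the sentinel `(0, 0, 0, [], True)`, a caller can treat an unreadable BOW file the same way as an empty library. A hedged usage sketch, where `graph` stands in for a constructed Graph instance and is not defined in this diff:

```python
# illustrative only - 'graph' is a hypothetical Graph instance
bow_index, bow_byte_index, bow_tokens, bow_files, no_bow = graph.bow_locator()

if no_bow:
    # covers both "no BOW files yet" and the new fail-soft sentinel
    # (0, 0, 0, [], True) returned when the top BOW file is unreadable
    print("no readable BOW data - run build_graph() first")
else:
    print(f"resuming at file {bow_index}, byte offset {bow_byte_index}, "
          f"{bow_tokens} tokens")
```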
115 | | - def build_graph(self): |
| 132 | + def build_graph(self) -> dict: |
116 | 133 |
117 | 134 | """ Generates multiple valuable nlp artifacts in the library's /nlp folder path, with the |
118 | 135 | primary objective of generating the co-occurrence matrix. """ |
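For orientation on the docstring's main artifact: a co-occurrence matrix counts how often vocabulary terms appear near each other in the corpus. A toy sketch of the idea, not the module's actual implementation:

```python
from collections import Counter
from itertools import combinations

def toy_cooccurrence(tokens, window=3):
    """Count unordered token pairs that fall inside a sliding window -
    a toy stand-in for the co-occurrence matrix built by build_graph()."""
    counts = Counter()
    for i in range(len(tokens) - window + 1):
        for pair in combinations(sorted(set(tokens[i:i + window])), 2):
            counts[pair] += 1
    return counts

# toy_cooccurrence("the cat sat on the mat".split())
# -> Counter({('sat', 'the'): 2, ('on', 'the'): 2, ...})
```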
@@ -186,9 +203,11 @@ def build_graph(self): |
186 | 203 | graph_summary.update({"time_stamp": ts}) |
187 | 204 |
188 | 205 | # write to manifest.json for knowledge graph |
189 | | - json_dict = json.dumps(graph_summary,indent=2) |
190 | | - with open(os.path.join(self.library.nlp_path,"manifest.json"),"w", encoding='utf-8') as outfile: |
191 | | - outfile.write(json_dict) |
| 206 | + try:
| 207 | +     with open(os.path.join(self.library.nlp_path, "manifest.json"), "w", encoding='utf-8') as outfile:
| 208 | +         outfile.write(json.dumps(graph_summary, indent=2))
| 209 | + except Exception as e:
| 210 | +     logger.error(f"Failed to write manifest.json: {e}")
192 | 211 |
193 | 212 | return graph_summary |
194 | 213 |
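The diff hardens only the write side of the manifest; a matching defensive reader is sketched below, where the `read_manifest` helper name and standalone form are illustrative, not part of this change:

```python
import json
import logging
import os

logger = logging.getLogger(__name__)

def read_manifest(nlp_path):
    """Load the knowledge-graph manifest written above, returning {} when
    the file is missing or malformed; 'nlp_path' corresponds to
    self.library.nlp_path."""
    try:
        with open(os.path.join(nlp_path, "manifest.json"), "r", encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logger.error(f"Could not read manifest.json: {e}")
        return {}
```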
@@ -833,16 +852,25 @@ def get_unique_vocab_len(self): |
833 | 852 |
834 | 853 | return len(self.get_unique_vocab_lookup()) |
835 | 854 |
836 | | - def get_unique_vocab_lookup(self): |
| 855 | + def get_unique_vocab_lookup(self) -> dict: |
837 | 856 |
838 | 857 | """ Returns the unique vocab list found in the Library corpus. """ |
839 | 858 |
840 | 859 | if self.library.get_knowledge_graph_status() != "yes": |
841 | 860 | self.build_graph() |
842 | 861 |
843 | | - j = json.load(open(os.path.join(self.library.nlp_path,"vocab_lookup.json"), "r", encoding='utf-8')) |
844 | | - |
845 | | - return j |
| 862 | + try:
| 863 | +     with open(os.path.join(self.library.nlp_path, "vocab_lookup.json"), "r", encoding='utf-8') as file:
| 864 | +         return json.load(file)
| 865 | + except FileNotFoundError:
| 866 | +     logger.error("vocab_lookup.json file not found.")
| 867 | +     return {}
| 868 | + except json.JSONDecodeError:
| 869 | +     logger.error("Error decoding JSON from vocab_lookup.json.")
| 870 | +     return {}
| 871 | + except Exception as e:
| 872 | +     logger.error(f"Unexpected error reading vocab_lookup.json: {e}")
| 873 | +     return {}
846 | 874 |
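One consequence of the `{}` fallback above: callers can no longer tell an empty vocabulary apart from a failed read without consulting the logs. An illustrative caller-side sketch, with `graph` again standing in for a constructed instance:

```python
vocab = graph.get_unique_vocab_lookup()  # 'graph' is hypothetical here

if not vocab:
    # {} now means either "no vocab built" or "vocab_lookup.json unreadable";
    # the distinction is only visible in the error log
    print("no vocab available - check logs if this is unexpected")
else:
    print(f"{len(vocab)} entries in vocab lookup")
```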
847 | 875 | def get_unique_vocab_reverse_lookup(self): |
848 | 876 |