Skip to content

Commit 7ce13ff

Browse files
authored
Merge pull request #219 from hc-sc-ocdo-bdpd/more-code-file-types
go, rb, cpp, h, javascript
2 parents 320f03d + 0d0f4f5 commit 7ce13ff

12 files changed

Lines changed: 561 additions & 0 deletions

file_processing/file.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,15 @@ class File:
3030
TRANSCRIPTION_APPLICABLE_EXTENSIONS = {".mp3", ".wav", ".mp4", ".flac", ".aiff", ".ogg"}
3131

3232
PROCESSORS = {
33+
".cpp": processors.CppFileProcessor,
34+
".cc": processors.CppFileProcessor,
3335
".csv": processors.CsvFileProcessor,
36+
".js": processors.JsFileProcessor,
3437
".txt": processors.TextFileProcessor,
3538
".pdf": processors.PdfFileProcessor,
3639
".docx": processors.DocxFileProcessor,
40+
".h": processors.HFileProcessor,
41+
".go": processors.GoFileProcessor,
3742
".msg": processors.MsgFileProcessor,
3843
".pptx": processors.PptxFileProcessor,
3944
".rtf": processors.RtfFileProcessor,
@@ -50,6 +55,7 @@ class File:
5055
".wav": processors.AudioFileProcessor,
5156
".mp4": processors.AudioFileProcessor,
5257
".flac": processors.AudioFileProcessor,
58+
".rb": processors.RbFileProcessor,
5359
".aiff": processors.AudioFileProcessor,
5460
".ogg": processors.AudioFileProcessor,
5561
".py": processors.PyFileProcessor,

file_processing/processors/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .txt_processor import TextFileProcessor
1818
from .pdf_processor import PdfFileProcessor
1919
from .docx_processor import DocxFileProcessor
20+
from .h_processor import HFileProcessor
2021
from .msg_processor import MsgFileProcessor
2122
from .png_processor import PngFileProcessor
2223
from .xlsx_processor import XlsxFileProcessor
@@ -41,3 +42,7 @@
4142
from .exe_processor import ExeFileProcessor
4243
from .whl_processor import WhlFileProcessor
4344
from .java_processor import JavaFileProcessor
45+
from .go_processor import GoFileProcessor
46+
from .rb_processor import RbFileProcessor
47+
from .cpp_processor import CppFileProcessor
48+
from .js_processor import JsFileProcessor
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import chardet
2+
import re
3+
from file_processing.errors import FileProcessingFailedError
4+
from file_processing.file_processor_strategy import FileProcessorStrategy
5+
6+
class CppFileProcessor(FileProcessorStrategy):
    """
    Processor for handling C++ source files (.cpp, .cc), extracting metadata and content.

    The counts are regex heuristics, not a real parse of the language.

    Attributes:
        metadata (dict): Contains metadata such as 'text', 'encoding', 'num_lines',
            'num_functions', 'num_classes', and 'num_comments'. When the processor
            is created with ``open_file=False`` it only holds a 'message' entry.
    """

    # Patterns are compiled once per class instead of on every process() call.
    # NOTE: the function heuristic matches "<word> <word>(...) {" on one line,
    # so control flow such as `else if (...) {` is counted as well.
    _FUNCTION_RE = re.compile(r'\b[a-zA-Z_]\w*\s+[a-zA-Z_]\w*\s*\(.*?\)\s*\{')
    _CLASS_RE = re.compile(r'\bclass\s+\w+')
    # DOTALL lets the /* ... */ alternative span several lines.
    _COMMENT_RE = re.compile(r'(//[^\n]*|/\*.*?\*/)', re.DOTALL)

    def __init__(self, file_path: str, open_file: bool = True) -> None:
        """
        Initializes the processor.

        Args:
            file_path: Path of the C++ file to process.
            open_file: When False, process() is a no-op and metadata only
                carries a 'File was not opened' message.
        """
        super().__init__(file_path, open_file)
        self.metadata = {'message': 'File was not opened'} if not open_file else {}

    def process(self) -> None:
        """
        Reads the file and fills ``self.metadata`` with its text and counts.

        Raises:
            FileProcessingFailedError: If the file cannot be read or analysed.
        """
        if not self.open_file:
            return
        try:
            # Context manager so the binary handle is closed deterministically.
            with open(self.file_path, 'rb') as raw_file:
                raw_data = raw_file.read()
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            # Re-read in text mode so newline handling matches what save() writes.
            with open(self.file_path, 'r', encoding=encoding, errors='replace') as f:
                text = f.read()

            self.metadata.update({
                'text': text,
                'encoding': encoding,
                'num_lines': len(text.splitlines()),
                'num_functions': len(self._FUNCTION_RE.findall(text)),
                'num_classes': len(self._CLASS_RE.findall(text)),
                'num_comments': len(self._COMMENT_RE.findall(text))
            })

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingFailedError(
                f"Error processing {self.file_path}: {e}"
            ) from e

    def save(self, output_path: str = None) -> None:
        """
        Writes the processed text using the detected encoding.

        Args:
            output_path: Destination path; defaults to the original file path.

        Raises:
            FileProcessingFailedError: If writing fails, or if 'text'/'encoding'
                are missing from metadata (for example process() was not run).
        """
        # Resolved outside the try block so the error message can always use it.
        save_path = output_path or self.file_path
        try:
            with open(save_path, 'w', encoding=self.metadata['encoding']) as f:
                f.write(self.metadata['text'])
        except Exception as e:
            raise FileProcessingFailedError(
                f"Error saving {self.file_path} to {save_path}: {e}"
            ) from e
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import chardet
2+
import re
3+
from file_processing.errors import FileProcessingFailedError
4+
from file_processing.file_processor_strategy import FileProcessorStrategy
5+
6+
class GoFileProcessor(FileProcessorStrategy):
    """
    Processor for handling Go source files (.go), extracting metadata and content.

    The counts are regex heuristics, not a real parse of the language.

    Attributes:
        metadata (dict): Contains metadata such as 'text', 'encoding', 'num_lines',
            'num_functions', 'num_structs', and 'num_interfaces'. When the processor
            is created with ``open_file=False`` it only holds a 'message' entry.
    """

    # Patterns are compiled once per class instead of on every process() call.
    # NOTE: the function pattern requires a name directly after `func`, so
    # methods with a receiver (`func (r T) Name(`) are not matched by it.
    _FUNCTION_RE = re.compile(r'\bfunc\s+\w+\s*\(')
    _STRUCT_RE = re.compile(r'\btype\s+\w+\s+struct\s*\{')
    _INTERFACE_RE = re.compile(r'\btype\s+\w+\s+interface\s*\{')

    def __init__(self, file_path: str, open_file: bool = True) -> None:
        """
        Initializes the processor.

        Args:
            file_path: Path of the Go file to process.
            open_file: When False, process() is a no-op and metadata only
                carries a 'File was not opened' message.
        """
        super().__init__(file_path, open_file)
        self.metadata = {'message': 'File was not opened'} if not open_file else {}

    def process(self) -> None:
        """
        Reads the file and fills ``self.metadata`` with its text and counts.

        Raises:
            FileProcessingFailedError: If the file cannot be read or analysed.
        """
        if not self.open_file:
            return
        try:
            # Context manager so the binary handle is closed deterministically.
            with open(self.file_path, 'rb') as raw_file:
                raw_data = raw_file.read()
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            # errors='replace' keeps a wrong encoding guess from failing the
            # whole file: undecodable bytes become U+FFFD instead of raising.
            with open(self.file_path, 'r', encoding=encoding, errors='replace') as f:
                text = f.read()

            self.metadata.update({
                'text': text,
                'encoding': encoding,
                'num_lines': len(text.splitlines()),
                'num_functions': len(self._FUNCTION_RE.findall(text)),
                'num_structs': len(self._STRUCT_RE.findall(text)),
                'num_interfaces': len(self._INTERFACE_RE.findall(text))
            })

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingFailedError(
                f"Error processing {self.file_path}: {e}"
            ) from e

    def save(self, output_path: str = None) -> None:
        """
        Writes the processed text using the detected encoding.

        Args:
            output_path: Destination path; defaults to the original file path.

        Raises:
            FileProcessingFailedError: If writing fails, or if 'text'/'encoding'
                are missing from metadata (for example process() was not run).
        """
        # Resolved outside the try block so the error message can always use it.
        save_path = output_path or self.file_path
        try:
            with open(save_path, 'w', encoding=self.metadata['encoding']) as f:
                f.write(self.metadata['text'])
        except Exception as e:
            raise FileProcessingFailedError(
                f"Error saving {self.file_path} to {save_path}: {e}"
            ) from e
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import chardet
2+
import re
3+
from file_processing.errors import FileProcessingFailedError
4+
from file_processing.file_processor_strategy import FileProcessorStrategy
5+
6+
class HFileProcessor(FileProcessorStrategy):
    """
    Processor for handling C/C++ header files (.h), extracting metadata and content.

    The counts are regex heuristics, not a real parse of the language.

    Attributes:
        metadata (dict): Contains metadata such as:
            'text' (str): Full file content as string.
            'encoding' (str): Detected file encoding.
            'num_lines' (int): Total number of lines.
            'num_includes' (int): Count of preprocessor #include statements.
            'num_macros' (int): Count of #define statements.
            'num_structs' (int): Count of `struct <name>` occurrences.
            'num_classes' (int): Count of `class <name>` occurrences.
            'num_comments' (int): Count of single-line and multi-line comments.
            When the processor is created with ``open_file=False`` it only
            holds a 'message' entry.
    """

    # Patterns are compiled once per class instead of on every process() call.
    # MULTILINE makes ^ anchor at the start of every line for the directives.
    _INCLUDE_RE = re.compile(r'^\s*#\s*include\s+["<].*[">]', re.MULTILINE)
    _MACRO_RE = re.compile(r'^\s*#\s*define\s+\w+', re.MULTILINE)
    # NOTE: these match any `struct X` / `class X` text, so forward
    # declarations and uses as a type are counted too.
    _STRUCT_RE = re.compile(r'\bstruct\s+\w+', re.MULTILINE)
    _CLASS_RE = re.compile(r'\bclass\s+\w+', re.MULTILINE)
    # DOTALL lets the /* ... */ alternative span several lines.
    _COMMENT_RE = re.compile(r'(//[^\n]*|/\*.*?\*/)', re.DOTALL)

    def __init__(self, file_path: str, open_file: bool = True) -> None:
        """
        Initializes the processor.

        Args:
            file_path: Path of the header file to process.
            open_file: When False, process() is a no-op and metadata only
                carries a 'File was not opened' message.
        """
        super().__init__(file_path, open_file)
        self.metadata = {'message': 'File was not opened'} if not open_file else {}

    def process(self) -> None:
        """
        Reads the file and fills ``self.metadata`` with its text and counts.

        Raises:
            FileProcessingFailedError: If the file cannot be read or analysed.
        """
        if not self.open_file:
            return
        try:
            # Context manager so the binary handle is closed deterministically.
            with open(self.file_path, 'rb') as raw_file:
                raw_data = raw_file.read()
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            # Re-read in text mode so newline handling matches what save() writes.
            with open(self.file_path, 'r', encoding=encoding, errors='replace') as f:
                text = f.read()

            self.metadata.update({
                'text': text,
                'encoding': encoding,
                'num_lines': len(text.splitlines()),
                'num_includes': len(self._INCLUDE_RE.findall(text)),
                'num_macros': len(self._MACRO_RE.findall(text)),
                'num_structs': len(self._STRUCT_RE.findall(text)),
                'num_classes': len(self._CLASS_RE.findall(text)),
                'num_comments': len(self._COMMENT_RE.findall(text)),
            })

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingFailedError(
                f"Error processing {self.file_path}: {e}"
            ) from e

    def save(self, output_path: str = None) -> None:
        """
        Writes the processed text using the detected encoding.

        Args:
            output_path: Destination path; defaults to the original file path.

        Raises:
            FileProcessingFailedError: If writing fails, or if 'text'/'encoding'
                are missing from metadata (for example process() was not run).
        """
        # Resolved outside the try block so the error message can always use it.
        save_path = output_path or self.file_path
        try:
            with open(save_path, 'w', encoding=self.metadata['encoding']) as f:
                f.write(self.metadata['text'])
        except Exception as e:
            raise FileProcessingFailedError(
                f"Error saving {self.file_path} to {save_path}: {e}"
            ) from e
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import chardet
2+
import re
3+
from file_processing.errors import FileProcessingFailedError
4+
from file_processing.file_processor_strategy import FileProcessorStrategy
5+
6+
class JsFileProcessor(FileProcessorStrategy):
    """
    Processor for handling JavaScript (.js) source files, extracting metadata and content.

    The counts are regex heuristics, not a real parse of the language.

    Attributes:
        metadata (dict): Contains metadata such as 'text', 'encoding', 'num_lines',
            'num_functions', 'num_classes', and 'num_comments'. When the processor
            is created with ``open_file=False`` it only holds a 'message' entry.
    """

    # Patterns are compiled once per class instead of on every process() call.
    # Matches 'function foo(...) {' or 'export function foo(...) {' on one line.
    # NOTE: only named `function` declarations match; arrow functions and
    # anonymous function expressions are not counted by this pattern.
    _FUNCTION_RE = re.compile(r'\b(function|export\s+function)\s+[A-Za-z_]\w*\s*\(.*?\)\s*\{')
    # Matches 'class SomeClass' with an optional opening brace.
    _CLASS_RE = re.compile(r'\bclass\s+[A-Za-z_]\w*\s*\{?')
    # Matches single-line (// ...) and multi-line (/* ... */) comments;
    # DOTALL lets the block-comment alternative span several lines.
    _COMMENT_RE = re.compile(r'(//[^\n]*|/\*.*?\*/)', re.DOTALL)

    def __init__(self, file_path: str, open_file: bool = True) -> None:
        """
        Initializes the processor.

        Args:
            file_path: Path of the JavaScript file to process.
            open_file: When False, process() is a no-op and metadata only
                carries a 'File was not opened' message.
        """
        super().__init__(file_path, open_file)
        self.metadata = {'message': 'File was not opened'} if not open_file else {}

    def process(self) -> None:
        """
        Extracts JavaScript file metadata if open_file is True.

        Raises:
            FileProcessingFailedError: If the file cannot be read or analysed.
        """
        if not self.open_file:
            return
        try:
            # Context manager so the binary handle is closed deterministically.
            with open(self.file_path, 'rb') as raw_file:
                raw_data = raw_file.read()
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            # Read using the detected (or fallback) encoding
            with open(self.file_path, 'r', encoding=encoding, errors='replace') as f:
                text = f.read()

            self.metadata.update({
                'text': text,
                'encoding': encoding,
                'num_lines': len(text.splitlines()),
                'num_functions': len(self._FUNCTION_RE.findall(text)),
                'num_classes': len(self._CLASS_RE.findall(text)),
                'num_comments': len(self._COMMENT_RE.findall(text))
            })

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingFailedError(
                f"Error processing {self.file_path}: {e}"
            ) from e

    def save(self, output_path: str = None) -> None:
        """
        Saves the JavaScript file to the specified path.

        Args:
            output_path: Destination path; defaults to the original file path.

        Raises:
            FileProcessingFailedError: If writing fails, or if 'text'/'encoding'
                are missing from metadata (for example process() was not run).
        """
        # Resolved outside the try block so the error message can always use it.
        save_path = output_path or self.file_path
        try:
            with open(save_path, 'w', encoding=self.metadata['encoding']) as f:
                f.write(self.metadata['text'])
        except Exception as e:
            raise FileProcessingFailedError(
                f"Error saving {self.file_path} to {save_path}: {e}"
            ) from e
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import chardet
2+
import re
3+
from file_processing.errors import FileProcessingFailedError
4+
from file_processing.file_processor_strategy import FileProcessorStrategy
5+
6+
class RbFileProcessor(FileProcessorStrategy):
    """
    Processor for handling Ruby (.rb) source files, extracting metadata and content.

    The counts are regex heuristics, not a real parse of the language.

    Attributes:
        metadata (dict): Contains metadata such as 'text', 'encoding', 'num_lines',
            'num_methods', 'num_classes', and 'num_modules'. When the processor
            is created with ``open_file=False`` it only holds a 'message' entry.
    """

    # Patterns are compiled once per class instead of on every process() call.
    # MULTILINE makes ^ anchor at each line start, so only keywords that begin
    # a line (after optional whitespace) are counted.
    _METHOD_RE = re.compile(r'^\s*def\s+\w+', re.MULTILINE)
    _CLASS_RE = re.compile(r'^\s*class\s+\w+', re.MULTILINE)
    _MODULE_RE = re.compile(r'^\s*module\s+\w+', re.MULTILINE)

    def __init__(self, file_path: str, open_file: bool = True) -> None:
        """
        Initializes the processor.

        Args:
            file_path: Path of the Ruby file to process.
            open_file: When False, process() is a no-op and metadata only
                carries a 'File was not opened' message.
        """
        super().__init__(file_path, open_file)
        self.metadata = {'message': 'File was not opened'} if not open_file else {}

    def process(self) -> None:
        """
        Reads the file and fills ``self.metadata`` with its text and counts.

        Raises:
            FileProcessingFailedError: If the file cannot be read or analysed.
        """
        if not self.open_file:
            return
        try:
            # Context manager so the binary handle is closed deterministically.
            with open(self.file_path, 'rb') as raw_file:
                raw_data = raw_file.read()
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

            # errors='replace' keeps a wrong encoding guess from failing the
            # whole file: undecodable bytes become U+FFFD instead of raising.
            with open(self.file_path, 'r', encoding=encoding, errors='replace') as f:
                text = f.read()

            self.metadata.update({
                'text': text,
                'encoding': encoding,
                'num_lines': len(text.splitlines()),
                'num_methods': len(self._METHOD_RE.findall(text)),
                'num_classes': len(self._CLASS_RE.findall(text)),
                'num_modules': len(self._MODULE_RE.findall(text))
            })

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise FileProcessingFailedError(
                f"Error processing {self.file_path}: {e}"
            ) from e

    def save(self, output_path: str = None) -> None:
        """
        Writes the processed text using the detected encoding.

        Args:
            output_path: Destination path; defaults to the original file path.

        Raises:
            FileProcessingFailedError: If writing fails, or if 'text'/'encoding'
                are missing from metadata (for example process() was not run).
        """
        # Resolved outside the try block so the error message can always use it.
        save_path = output_path or self.file_path
        try:
            with open(save_path, 'w', encoding=self.metadata['encoding']) as f:
                f.write(self.metadata['text'])
        except Exception as e:
            raise FileProcessingFailedError(
                f"Error saving {self.file_path} to {save_path}: {e}"
            ) from e

0 commit comments

Comments
 (0)