-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.py
More file actions
243 lines (199 loc) · 8.23 KB
/
setup.py
File metadata and controls
243 lines (199 loc) · 8.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env python3
"""
Setup script for Big Data Text Analysis Project
Handles installation, configuration, and validation
"""
import os
import sys
import subprocess
import logging
from pathlib import Path
def setup_logging():
    """Configure root logging and hand back this module's logger.

    The root logger is set to INFO with a
    ``timestamp - level - message`` line format.

    Returns:
        logging.Logger: the logger registered under ``__name__``.
    """
    line_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=line_format, level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger
def check_python_version():
    """Verify the running interpreter is Python 3.8 or newer.

    Prints the detected ``major.minor`` version when it is acceptable.
    Otherwise prints an error and terminates the process with exit status 1.
    """
    major, minor = sys.version_info[:2]
    if (major, minor) >= (3, 8):
        print(f"✅ Python {major}.{minor} detected")
        return
    print("❌ Python 3.8 or higher is required")
    sys.exit(1)
def create_virtual_environment():
    """Ensure a ``myenv`` virtual environment exists in the working directory.

    If a ``myenv`` path already exists it is reused as-is; otherwise one is
    created with the current interpreter's ``venv`` module.

    Returns:
        bool: True when the environment exists or was created, False when
        the ``venv`` command exited with a non-zero status.
    """
    env_dir = Path("myenv")
    if not env_dir.exists():
        print("📦 Creating virtual environment...")
        venv_command = [sys.executable, "-m", "venv", "myenv"]
        try:
            subprocess.run(venv_command, check=True)
        except subprocess.CalledProcessError:
            print("❌ Failed to create virtual environment")
            return False
        print("✅ Virtual environment created successfully")
        return True

    print("✅ Virtual environment already exists")
    return True
def install_dependencies():
    """Install project dependencies and the spaCy English model into ``myenv``.

    Steps, in order:

    1. Best-effort upgrade of pip (a failure here never aborts the install).
    2. Install from ``requirements.txt`` if it exists in the working
       directory, otherwise install a small fallback package list.
    3. Download the ``en_core_web_sm`` spaCy model, retrying once after
       explicitly installing spaCy.

    Returns:
        bool: True when every required step succeeded. False when a command
        exited non-zero or could not be started at all (for example because
        the virtual environment's executables are missing).
    """
    print("📦 Installing dependencies...")

    # Determine the pip/python commands based on OS
    if os.name == 'nt':  # Windows
        pip_cmd = "myenv\\Scripts\\pip"
        python_cmd = "myenv\\Scripts\\python"
    else:  # Unix/Linux/Mac
        pip_cmd = "myenv/bin/pip"
        python_cmd = "myenv/bin/python"

    # CalledProcessError: the command ran and exited non-zero.
    # OSError: the command could not be launched (missing or non-executable
    # file); without it a broken environment would crash with a traceback
    # instead of producing the documented False return.
    command_errors = (subprocess.CalledProcessError, OSError)

    try:
        # Try to upgrade pip, but don't fail if it doesn't work
        try:
            print("📦 Upgrading pip...")
            subprocess.run([pip_cmd, "install", "--upgrade", "pip"],
                           check=True, capture_output=True, text=True)
            print("✅ Pip upgraded successfully")
        except command_errors:
            print("⚠️ Pip upgrade failed, continuing with current version...")
            # On Windows, try the alternative upgrade method
            if os.name == 'nt':
                try:
                    subprocess.run([python_cmd, "-m", "pip", "install", "--upgrade", "pip"],
                                   check=True, capture_output=True, text=True)
                    print("✅ Pip upgraded using alternative method")
                except command_errors:
                    print("⚠️ Pip upgrade failed, using current version")

        # Install requirements
        if Path("requirements.txt").exists():
            print("📦 Installing packages from requirements.txt...")
            subprocess.run([pip_cmd, "install", "-r", "requirements.txt"], check=True)
            print("✅ Dependencies installed successfully")
        else:
            print("⚠️ requirements.txt not found, installing basic packages...")
            basic_packages = [
                "scikit-learn>=1.3.0",
                "spacy>=3.6.0",
                "numpy>=1.24.0",
                "pandas>=2.0.0",
            ]
            for package in basic_packages:
                print(f"📦 Installing {package}...")
                subprocess.run([pip_cmd, "install", package], check=True)
            print("✅ Basic packages installed")

        # Install spaCy model
        print("📦 Installing spaCy English model...")
        spacy_download = [python_cmd, "-m", "spacy", "download", "en_core_web_sm"]
        try:
            subprocess.run(spacy_download, check=True, capture_output=True, text=True)
            print("✅ spaCy model installed")
        except subprocess.CalledProcessError:
            print("⚠️ spaCy model installation failed, trying alternative method...")
            # Try installing spacy first if it's not already installed;
            # a failure in this retry falls through to the outer handler.
            subprocess.run([pip_cmd, "install", "spacy"], check=True)
            subprocess.run(spacy_download, check=True, capture_output=True, text=True)
            print("✅ spaCy model installed with alternative method")

        return True

    except command_errors as e:
        print(f"❌ Failed to install dependencies: {e}")
        return False
def create_directories():
    """Create the ``input``, ``output`` and ``results`` working directories.

    Directories are created relative to the current working directory;
    ones that already exist are left untouched. A confirmation line is
    printed for each name.
    """
    print("📁 Creating directories...")
    for name in ("input", "output", "results"):
        target = Path(name)
        target.mkdir(exist_ok=True)
        print(f"✅ Created directory: {name}")
def run_tests():
    """Run ``test_analysis.py`` with the virtual environment's interpreter.

    The script's output is captured; stdout is echoed on success, and both
    stdout and stderr are echoed on failure so the failure report is shown
    whichever stream the test script writes it to.

    Returns:
        bool: True when the test script exits with status 0. False when it
        exits non-zero or when the interpreter could not be started.
    """
    print("🧪 Running tests...")

    if os.name == 'nt':  # Windows
        python_cmd = "myenv\\Scripts\\python"
    else:  # Unix/Linux/Mac
        python_cmd = "myenv/bin/python"

    try:
        # No check=True: a non-zero exit is reported via returncode, so the
        # only exceptions expected here are launch failures (OSError).
        result = subprocess.run([python_cmd, "test_analysis.py"],
                                capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError):
        print("❌ Failed to run tests")
        return False

    if result.returncode == 0:
        print("✅ All tests passed!")
        print(result.stdout)
        return True

    print("⚠️ Some tests failed:")
    if result.stdout:
        print(result.stdout)
    print(result.stderr)
    return False
def create_sample_data():
    """Write a small sample document to ``input/sample_input.txt``.

    The ``input`` directory is created if it is missing, and any existing
    sample file is overwritten.
    """
    target = Path("input/sample_input.txt")
    target.parent.mkdir(exist_ok=True)

    content = """
This is a sample text for testing the EKU student handbook analysis pipeline.
The university provides various academic services to students.
Students must follow the academic policies and conduct guidelines.
The campus offers many resources for student success.
"""
    target.write_text(content)

    print(f"✅ Created sample input file: {target}")
def print_usage_instructions():
    """Print the post-setup banner and a short usage guide to stdout.

    The only platform-dependent part is the virtual environment activation
    command, chosen from ``os.name``.
    """
    rule = "=" * 60

    if os.name == 'nt':  # Windows
        activate_line = " myenv\\Scripts\\activate"
    else:  # Unix/Linux/Mac
        activate_line = " source myenv/bin/activate"

    # One entry per printed line, in display order.
    lines = [
        "\n" + rule,
        "🎉 SETUP COMPLETE!",
        rule,
        "\n📋 Usage Instructions:",
        "\n1. Activate virtual environment:",
        activate_line,
        "\n2. Run the full pipeline:",
        " ./run_pipeline.sh",
        "\n3. Run individual analyses:",
        " # Word count",
        " cat input/sample_input.txt | python preprocess.py | python mapper.py | python reducer.py",
        "\n # TF-IDF",
        " cat input/sample_input.txt | python preprocess_tfidf.py | python mapper_tfidf.py doc1 | python reducer_tfidf.py",
        "\n # LDA",
        " cat input/sample_input.txt | python mapper_lda.py | python reducer_lda.py",
        "\n4. Run tests:",
        " python test_analysis.py",
        "\n📁 Project structure:",
        " - input/ : Place your input files here",
        " - output/ : Intermediate outputs",
        " - results/ : Final analysis results",
        "\n🔧 Configuration:",
        " - Edit config.py to customize parameters",
        " - Modify utils.py for custom preprocessing",
        "\n📚 Documentation:",
        " - README.md contains detailed usage instructions",
        " - All Python files include docstrings and type hints",
        "\n" + rule,
    ]
    for line in lines:
        print(line)
def main():
    """Run the complete project setup and exit the process.

    Sequence: configure logging, check the Python version, create the
    virtual environment, install dependencies, create the working
    directories and sample data, run the tests, then print usage
    instructions.

    Exits with status 1 if the virtual environment or the dependency
    installation fails. Otherwise exits with status 0, including when the
    tests fail, which is reported only as a warning.
    """
    setup_logging()

    print("🚀 Big Data Text Analysis Project Setup")
    print("="*50)

    check_python_version()

    # These two steps are mandatory: stop at the first one that fails.
    for required_step in (create_virtual_environment, install_dependencies):
        if not required_step():
            sys.exit(1)

    create_directories()
    create_sample_data()

    tests_passed = run_tests()

    print_usage_instructions()

    if tests_passed:
        summary = "✅ Setup completed successfully!"
    else:
        summary = "⚠️ Setup completed with test warnings"
    print(summary)
    sys.exit(0)


if __name__ == "__main__":
    main()