Skip to content

Commit 6549839

Browse files
committed
Added manual trigger for data loading pipeline
1 parent b1854f0 commit 6549839

File tree

7 files changed

+380
-32
lines changed

7 files changed

+380
-32
lines changed

scripts/python/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
2+
selenium==4.15.2

scripts/python/web_scraper.py

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def __init__(self, download_directory=None, headless=True):
5050

5151
# Create target directory if it doesn't exist
5252
os.makedirs(self.target_directory, exist_ok=True)
53+
print(f"Script directory: {script_dir}")
54+
print(f"Project root: {project_root}")
5355
print(f"Target directory for processed files: {self.target_directory}")
5456

5557
# Convert to absolute path and ensure it exists
@@ -59,33 +61,37 @@ def __init__(self, download_directory=None, headless=True):
5961

6062
chrome_options = Options()
6163
if headless:
62-
chrome_options.add_argument('--headless')
64+
chrome_options.add_argument('--headless=new') # Use new headless mode
6365

6466
chrome_options.add_argument('--no-sandbox')
6567
chrome_options.add_argument('--disable-dev-shm-usage')
6668
chrome_options.add_argument('--disable-gpu')
67-
chrome_options.add_argument('--remote-debugging-port=9222')
6869
chrome_options.add_argument('--window-size=1920,1200')
69-
chrome_options.add_argument('--start-maximized')
7070

71-
# Additional options to prevent permission issues
71+
# Remove problematic flags that can cause crashes
72+
# REMOVED: --disable-javascript (WITS requires JavaScript!)
73+
# REMOVED: --disable-images (can cause issues)
74+
# REMOVED: --disable-plugins
75+
# REMOVED: --single-process (can cause instability)
76+
77+
# Keep essential flags
7278
chrome_options.add_argument('--disable-extensions')
73-
chrome_options.add_argument('--disable-plugins')
74-
chrome_options.add_argument('--disable-images')
75-
chrome_options.add_argument('--disable-javascript')
76-
chrome_options.add_argument('--single-process')
7779
chrome_options.add_argument('--disable-background-networking')
7880
chrome_options.add_argument('--disable-default-apps')
7981
chrome_options.add_argument('--disable-sync')
8082

8183
# Add user-data-dir to avoid permission issues
8284
chrome_options.add_argument('--user-data-dir=/tmp/chrome-user-data')
8385

84-
# For Docker environments
86+
# For Docker/server environments
8587
chrome_options.add_argument('--disable-background-timer-throttling')
8688
chrome_options.add_argument('--disable-backgrounding-occluded-windows')
8789
chrome_options.add_argument('--disable-renderer-backgrounding')
8890

91+
# Disable automation detection
92+
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
93+
chrome_options.add_experimental_option('useAutomationExtension', False)
94+
8995
# Configure downloads with absolute path
9096
prefs = {
9197
"download.default_directory": self.download_directory,
@@ -99,10 +105,32 @@ def __init__(self, download_directory=None, headless=True):
99105

100106
# Initialize service and driver with error handling
101107
try:
102-
self.service = QuietService(executable_path="chromedriver")
108+
# First, try to find chromedriver in the script's directory
109+
script_dir = os.path.dirname(os.path.abspath(__file__))
110+
local_chromedriver = os.path.join(script_dir, 'chromedriver')
111+
112+
# Check if chromedriver exists locally
113+
if os.path.exists(local_chromedriver):
114+
print(f"Using local chromedriver: {local_chromedriver}")
115+
# Make sure it's executable
116+
os.chmod(local_chromedriver, 0o755)
117+
self.service = QuietService(executable_path=local_chromedriver)
118+
else:
119+
# Fall back to system chromedriver
120+
print(f"Local chromedriver not found at: {local_chromedriver}")
121+
print("Trying system chromedriver...")
122+
self.service = QuietService(executable_path="chromedriver")
123+
103124
self.driver = webdriver.Chrome(service=self.service, options=chrome_options)
125+
print("Chrome driver initialized successfully")
126+
104127
except Exception as e:
105128
print(f"Error initializing Chrome driver: {e}")
129+
print("\nTroubleshooting:")
130+
print(f"1. Place chromedriver in: {script_dir}")
131+
print(f"2. Make it executable: chmod +x {os.path.join(script_dir, 'chromedriver')}")
132+
print("3. Or install system-wide: sudo apt install chromium-chromedriver")
133+
print("4. Or install webdriver-manager: pip3 install webdriver-manager")
106134
raise
107135

108136
self.base_url = "https://wits.worldbank.org/WITS/WITS/QuickQuery/FindTariff/FindTariff.aspx?Page=FindATariff"
@@ -779,6 +807,8 @@ def get_csv_file_from_zip(self):
779807
# Move CSV to Spring Boot resources directory instead of download directory
780808
csv_filename = os.path.basename(csv_path)
781809
final_csv_path = os.path.join(self.target_directory, csv_filename)
810+
print(f"Moving CSV from: {csv_path}")
811+
print(f"Moving CSV to: {final_csv_path}")
782812
shutil.move(csv_path, final_csv_path)
783813
print(f"Moved CSV file to: {final_csv_path}")
784814

@@ -797,7 +827,7 @@ def rename_csv_file(self, csv_path, country_code, year):
797827
"""
798828
try:
799829
# Create new filename
800-
new_filename = f"HS2017{country_code}{year}.csv"
830+
new_filename = f"HS2017{country_code}Year{year}.csv"
801831
new_path = os.path.join(os.path.dirname(csv_path), new_filename)
802832

803833
# Rename the file
@@ -932,10 +962,10 @@ def main():
932962

933963
# Map country code to market name (expand this mapping as needed)
934964
country_mapping = {
935-
'USA': 'United States',
936-
'CHN': 'China',
965+
'USA': 'United States', #Pass
966+
'CHN': 'China',
937967
'JPN': 'Japan',
938-
'DEU': 'Germany',
968+
'DEU': 'Germany', #Fail
939969
'IND': 'India',
940970
'GBR': 'United Kingdom',
941971
'FRA': 'France',
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package com.ubs.tariffapp;
2+
3+
import org.springframework.boot.SpringApplication;
4+
import org.springframework.boot.autoconfigure.SpringBootApplication;
5+
import org.springframework.context.ConfigurableApplicationContext;
6+
7+
import com.ubs.tariffapp.services.PythonScraperService;
8+
9+
/**
10+
* Command-line runner for the scraper→cleaner→loader pipeline
11+
*
12+
* Usage:
13+
* mvn exec:java -Dexec.mainClass="com.ubs.tariffapp.PipelineRunner" -Dexec.args="USA,CHN,SGP 2023"
14+
*
15+
* Arguments:
16+
* - Country codes (comma-separated, e.g., USA,CHN,SGP,JPN)
17+
* - Year (e.g., 2023)
18+
*/
19+
@SpringBootApplication
20+
public class PipelineRunner {
21+
22+
public static void main(String[] args) {
23+
if (args.length < 2) {
24+
System.err.println("Usage: PipelineRunner <country_codes> <year>");
25+
System.err.println("Example: PipelineRunner USA,CHN,SGP 2023");
26+
System.err.println("Example: PipelineRunner USA 2023");
27+
System.err.println("\nSupported country codes: USA, CHN, SGP, JPN, DEU, GBR, FRA, etc.");
28+
System.exit(1);
29+
}
30+
31+
String[] countryCodes = args[0].split(",");
32+
String year = args[1];
33+
34+
System.out.println("=".repeat(60));
35+
System.out.println("Starting Tariff Data Pipeline");
36+
System.out.println("Countries: " + String.join(", ", countryCodes));
37+
System.out.println("Year: " + year);
38+
System.out.println("=".repeat(60));
39+
40+
ConfigurableApplicationContext context = null;
41+
int successCount = 0;
42+
int failCount = 0;
43+
44+
try {
45+
context = SpringApplication.run(PipelineRunner.class, args);
46+
PythonScraperService scraperService = context.getBean(PythonScraperService.class);
47+
48+
for (String countryCode : countryCodes) {
49+
String country = countryCode.trim().toUpperCase();
50+
51+
System.out.println("\n" + "-".repeat(60));
52+
System.out.println("Processing: " + country);
53+
System.out.println("-".repeat(60));
54+
55+
try {
56+
boolean success = scraperService.scrapeAndProcessCountryData(country);
57+
58+
if (success) {
59+
successCount++;
60+
System.out.println("✓ " + country + " completed successfully");
61+
} else {
62+
failCount++;
63+
System.err.println("✗ " + country + " failed");
64+
}
65+
} catch (Exception e) {
66+
failCount++;
67+
System.err.println("✗ " + country + " failed with error: " + e.getMessage());
68+
}
69+
}
70+
71+
System.out.println("\n" + "=".repeat(60));
72+
System.out.println("Pipeline Summary:");
73+
System.out.println(" Total: " + countryCodes.length);
74+
System.out.println(" Success: " + successCount);
75+
System.out.println(" Failed: " + failCount);
76+
System.out.println("=".repeat(60));
77+
78+
System.exit(failCount == 0 ? 0 : 1);
79+
80+
} catch (Exception e) {
81+
System.err.println("\nFatal error running pipeline: " + e.getMessage());
82+
e.printStackTrace();
83+
System.exit(1);
84+
} finally {
85+
if (context != null) {
86+
context.close();
87+
}
88+
}
89+
}
90+
}

src/main/java/com/ubs/tariffapp/services/PythonScraperService.java

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -97,18 +97,29 @@ public boolean scrapeAndProcessCountryData(String countryCode) {
9797
*/
9898
private String runWitsDataScraper(String countryCode, String mostRecentYear) {
9999
try {
100-
// Build command with country code and year as arguments
100+
// Get the absolute path to the script
101+
Path scriptPath = Paths.get(pythonScriptPath).toAbsolutePath();
102+
103+
if (!Files.exists(scriptPath)) {
104+
logger.error("Python script not found at: {}", scriptPath);
105+
return null;
106+
}
107+
108+
logger.info("Using Python script: {}", scriptPath);
109+
110+
// Build command with absolute script path
111+
// Add -u flag to run Python in unbuffered mode for real-time output
101112
ProcessBuilder processBuilder = new ProcessBuilder(
102-
pythonExecutable,
103-
pythonScriptPath,
113+
pythonExecutable,
114+
"-u", // Unbuffered mode - see output in real-time!
115+
scriptPath.toString(),
104116
countryCode,
105117
mostRecentYear,
106118
"--headless"
107119
);
108120

109121
Map<String, String> env = processBuilder.environment();
110122

111-
// Use the configured credentials from application properties
112123
if (witsUsername != null && !witsUsername.isEmpty()) {
113124
env.put("WITS_USERNAME", witsUsername);
114125
logger.debug("WITS_USERNAME environment variable set for scraper from properties");
@@ -130,12 +141,7 @@ private String runWitsDataScraper(String countryCode, String mostRecentYear) {
130141
env.put("WITS_API_KEY", witsApiKey);
131142
}
132143

133-
// Set working directory to script directory
134-
Path scriptDir = Paths.get(pythonScriptPath).getParent();
135-
if (scriptDir != null && Files.exists(scriptDir)) {
136-
processBuilder.directory(scriptDir.toFile());
137-
logger.debug("Working directory set to: {}", scriptDir);
138-
}
144+
// Don't set working directory - let Python script handle paths
139145

140146
processBuilder.redirectErrorStream(true);
141147
Process process = processBuilder.start();
@@ -145,9 +151,11 @@ private String runWitsDataScraper(String countryCode, String mostRecentYear) {
145151
String line;
146152
while ((line = reader.readLine()) != null) {
147153
output.append(line).append("\n");
148-
// Don't log credentials that might appear in output
154+
// Log output in real time, skipping any line that mentions a password or credential
149155
if (!line.toLowerCase().contains("password") && !line.toLowerCase().contains("credential")) {
150-
logger.debug("Web scraper output: {}", line);
156+
// Log to console AND logger
157+
System.out.println("Python: " + line); // Console output
158+
logger.info("Python: {}", line); // Logger output
151159
}
152160
}
153161
}
@@ -204,6 +212,16 @@ private boolean processScrapedFile(String fileName, String countryCode) {
204212
String[] args = {fileName};
205213
HSDataCleaner.main(args);
206214
logger.info("HSDataCleaner processing completed for file: {}", fileName);
215+
216+
// Delete the original scraped file after cleaning
217+
Path originalFile = Paths.get("src/main/resources/data/test_data").resolve(fileName);
218+
try {
219+
Files.delete(originalFile);
220+
logger.info("Deleted original scraped file: {}", fileName);
221+
} catch (Exception e) {
222+
logger.warn("Could not delete original file {}: {}", fileName, e.getMessage());
223+
}
224+
207225
} catch (Exception e) {
208226
logger.error("Error running HSDataCleaner: {}", e.getMessage(), e);
209227
return false;

src/main/java/com/ubs/tariffapp/utils/HSDataCleaner.java

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
import java.io.IOException;
77
import java.io.InputStream;
88
import java.io.InputStreamReader;
9-
import java.math.BigDecimal;
10-
import java.math.RoundingMode;
9+
import java.nio.file.Files;
10+
import java.nio.file.Path;
11+
import java.nio.file.Paths;
1112
import java.util.Arrays;
1213
import java.util.HashMap;
1314
import java.util.HashSet;
@@ -130,13 +131,21 @@ public static void main(String[] args) {
130131
System.out.println("No file argument provided, using default: " + inputFileName);
131132
}
132133

133-
// Read input from resources
134-
InputStream inputStream = HSDataCleaner.class.getResourceAsStream("/data/test_data/" + inputFileName);
135-
if (inputStream == null) {
134+
// Read input from file system instead of classpath resources
135+
Path inputPath = Paths.get("src/main/resources/data/test_data/" + inputFileName);
136+
if (!Files.exists(inputPath)) {
136137
System.err.println("Input CSV file not found in resources folder: " + inputFileName);
137138
return;
138139
}
139140

141+
InputStream inputStream;
142+
try {
143+
inputStream = Files.newInputStream(inputPath);
144+
} catch (IOException e) {
145+
System.err.println("Error opening input file: " + e.getMessage());
146+
return;
147+
}
148+
140149
String outputFileName = "clean_" + inputFileName;
141150
String outputFile = "src/main/resources/data/clean_data/" + outputFileName;
142151

@@ -295,6 +304,16 @@ public static void main(String[] args) {
295304
System.out.println("Output saved to " + outputFile);
296305
System.out.println("Added columns: Industry, DutyType, StandardizedAVRate, SpecificDutyAmount, Currency, Unit, OriginalSpecificDuty");
297306

307+
System.out.println("Data cleaning completed successfully.");
308+
309+
// Delete the original file from test_data
310+
try {
311+
Files.delete(inputPath);
312+
System.out.println("Deleted original file: " + inputPath);
313+
} catch (Exception e) {
314+
System.err.println("Warning: Could not delete original file: " + e.getMessage());
315+
}
316+
298317
} catch (IOException e) {
299318
e.printStackTrace();
300319
return;

0 commit comments

Comments
 (0)