Added manual trigger for data loading pipeline

OborosYX · OborosYX · commit 65498397eb05 · 2025-11-10T18:09:01.000+08:00
diff --git a/scripts/python/requirements.txt b/scripts/python/requirements.txt
@@ -0,0 +1,2 @@
+
+selenium==4.15.2
diff --git a/scripts/python/web_scraper.py b/scripts/python/web_scraper.py
@@ -50,6 +50,8 @@ def __init__(self, download_directory=None, headless=True):
         
         # Create target directory if it doesn't exist
         os.makedirs(self.target_directory, exist_ok=True)
+        print(f"Script directory: {script_dir}")
+        print(f"Project root: {project_root}")
         print(f"Target directory for processed files: {self.target_directory}")
             
         # Convert to absolute path and ensure it exists
@@ -59,33 +61,37 @@ def __init__(self, download_directory=None, headless=True):
         
         chrome_options = Options()
         if headless:
-            chrome_options.add_argument('--headless')
+            chrome_options.add_argument('--headless=new')  # Use new headless mode
             
         chrome_options.add_argument('--no-sandbox')
         chrome_options.add_argument('--disable-dev-shm-usage')
         chrome_options.add_argument('--disable-gpu')
-        chrome_options.add_argument('--remote-debugging-port=9222')
         chrome_options.add_argument('--window-size=1920,1200')
-        chrome_options.add_argument('--start-maximized')
         
-        # Additional options to prevent permission issues
+        # Remove problematic flags that can cause crashes
+        # REMOVED: --disable-javascript (WITS requires JavaScript!)
+        # REMOVED: --disable-images (can cause issues)
+        # REMOVED: --disable-plugins
+        # REMOVED: --single-process (can cause instability)
+        
+        # Keep essential flags
         chrome_options.add_argument('--disable-extensions')
-        chrome_options.add_argument('--disable-plugins')
-        chrome_options.add_argument('--disable-images')
-        chrome_options.add_argument('--disable-javascript')
-        chrome_options.add_argument('--single-process')
         chrome_options.add_argument('--disable-background-networking')
         chrome_options.add_argument('--disable-default-apps')
         chrome_options.add_argument('--disable-sync')
         
         # Add user-data-dir to avoid permission issues
         chrome_options.add_argument('--user-data-dir=/tmp/chrome-user-data')
         
-        # For Docker environments
+        # For Docker/server environments
         chrome_options.add_argument('--disable-background-timer-throttling')
         chrome_options.add_argument('--disable-backgrounding-occluded-windows')
         chrome_options.add_argument('--disable-renderer-backgrounding')
         
+        # Disable automation detection
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option('useAutomationExtension', False)
+        
         # Configure downloads with absolute path
         prefs = {
             "download.default_directory": self.download_directory,
@@ -99,10 +105,32 @@ def __init__(self, download_directory=None, headless=True):
         
         # Initialize service and driver with error handling
         try:
-            self.service = QuietService(executable_path="chromedriver")
+            # First, try to find chromedriver in the script's directory
+            script_dir = os.path.dirname(os.path.abspath(__file__))
+            local_chromedriver = os.path.join(script_dir, 'chromedriver')
+            
+            # Check if chromedriver exists locally
+            if os.path.exists(local_chromedriver):
+                print(f"Using local chromedriver: {local_chromedriver}")
+                # Make sure it's executable
+                os.chmod(local_chromedriver, 0o755)
+                self.service = QuietService(executable_path=local_chromedriver)
+            else:
+                # Fall back to system chromedriver
+                print(f"Local chromedriver not found at: {local_chromedriver}")
+                print("Trying system chromedriver...")
+                self.service = QuietService(executable_path="chromedriver")
+            
             self.driver = webdriver.Chrome(service=self.service, options=chrome_options)
+            print("Chrome driver initialized successfully")
+            
         except Exception as e:
             print(f"Error initializing Chrome driver: {e}")
+            print("\nTroubleshooting:")
+            print(f"1. Place chromedriver in: {script_dir}")
+            print(f"2. Make it executable: chmod +x {os.path.join(script_dir, 'chromedriver')}")
+            print("3. Or install system-wide: sudo apt install chromium-chromedriver")
+            print("4. Or install webdriver-manager: pip3 install webdriver-manager")
             raise
     
         self.base_url = "https://wits.worldbank.org/WITS/WITS/QuickQuery/FindTariff/FindTariff.aspx?Page=FindATariff"
@@ -779,6 +807,8 @@ def get_csv_file_from_zip(self):
         # Move CSV to Spring Boot resources directory instead of download directory
         csv_filename = os.path.basename(csv_path)
         final_csv_path = os.path.join(self.target_directory, csv_filename)
+        print(f"Moving CSV from: {csv_path}")
+        print(f"Moving CSV to: {final_csv_path}")
         shutil.move(csv_path, final_csv_path)
         print(f"Moved CSV file to: {final_csv_path}")
         
@@ -797,7 +827,7 @@ def rename_csv_file(self, csv_path, country_code, year):
         """
         try:
             # Create new filename
-            new_filename = f"HS2017{country_code}{year}.csv"
+            new_filename = f"HS2017{country_code}Year{year}.csv"
             new_path = os.path.join(os.path.dirname(csv_path), new_filename)
             
             # Rename the file
@@ -932,10 +962,10 @@ def main():
     
     # Map country code to market name (expand this mapping as needed)
     country_mapping = {
-        'USA': 'United States',
-        'CHN': 'China',
+        'USA': 'United States', #Pass
+        'CHN': 'China', 
         'JPN': 'Japan',
-        'DEU': 'Germany',
+        'DEU': 'Germany', #Fail
         'IND': 'India',
         'GBR': 'United Kingdom',
         'FRA': 'France',
diff --git a/src/main/java/com/ubs/tariffapp/PipelineRunner.java b/src/main/java/com/ubs/tariffapp/PipelineRunner.java
@@ -0,0 +1,110 @@
+package com.ubs.tariffapp;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.ConfigurableApplicationContext;
+
+import com.ubs.tariffapp.services.PythonScraperService;
+
+/**
+ * Command-line runner for the scraper→cleaner→loader pipeline.
+ *
+ * <p>Boots the Spring context, looks up {@link PythonScraperService} and runs
+ * it once per requested country, then exits with a status code that reflects
+ * the overall result (0 = every country succeeded, 1 = otherwise).
+ *
+ * Usage:
+ * mvn exec:java -Dexec.mainClass="com.ubs.tariffapp.PipelineRunner" -Dexec.args="USA,CHN,SGP 2023"
+ *
+ * Arguments:
+ * - Country codes (comma-separated, e.g., USA,CHN,SGP,JPN)
+ * - Year (e.g., 2023)
+ */
+@SpringBootApplication
+public class PipelineRunner {
+
+    /**
+     * Entry point: validates the arguments, processes each country and
+     * terminates the JVM with the pipeline's exit code.
+     *
+     * @param args {@code args[0]} comma-separated country codes,
+     *             {@code args[1]} year
+     */
+    public static void main(String[] args) {
+        if (args.length < 2) {
+            System.err.println("Usage: PipelineRunner <country_codes> <year>");
+            System.err.println("Example: PipelineRunner USA,CHN,SGP 2023");
+            System.err.println("Example: PipelineRunner USA 2023");
+            System.err.println("\nSupported country codes: USA, CHN, SGP, JPN, DEU, GBR, FRA, etc.");
+            System.exit(1);
+        }
+
+        String[] countryCodes = args[0].split(",");
+        // NOTE(review): the year is only echoed below; the service call passes
+        // just the country code. Confirm whether it should be forwarded.
+        String year = args[1];
+
+        System.out.println("=".repeat(60));
+        System.out.println("Starting Tariff Data Pipeline");
+        System.out.println("Countries: " + String.join(", ", countryCodes));
+        System.out.println("Year: " + year);
+        System.out.println("=".repeat(60));
+
+        ConfigurableApplicationContext context = null;
+        int successCount = 0;
+        int failCount = 0;
+        // Decided inside the try block; System.exit is deferred until after
+        // the finally block because exit() never returns, so calling it
+        // earlier would skip the explicit context.close() below.
+        int exitCode;
+
+        try {
+            context = SpringApplication.run(PipelineRunner.class, args);
+            PythonScraperService scraperService = context.getBean(PythonScraperService.class);
+
+            for (String countryCode : countryCodes) {
+                String country = countryCode.trim().toUpperCase();
+
+                System.out.println("\n" + "-".repeat(60));
+                System.out.println("Processing: " + country);
+                System.out.println("-".repeat(60));
+
+                // A failure for one country must not abort the remaining ones.
+                try {
+                    boolean success = scraperService.scrapeAndProcessCountryData(country);
+
+                    if (success) {
+                        successCount++;
+                        System.out.println("✓ " + country + " completed successfully");
+                    } else {
+                        failCount++;
+                        System.err.println("✗ " + country + " failed");
+                    }
+                } catch (Exception e) {
+                    failCount++;
+                    System.err.println("✗ " + country + " failed with error: " + e.getMessage());
+                }
+            }
+
+            System.out.println("\n" + "=".repeat(60));
+            System.out.println("Pipeline Summary:");
+            System.out.println("  Total: " + countryCodes.length);
+            System.out.println("  Success: " + successCount);
+            System.out.println("  Failed: " + failCount);
+            System.out.println("=".repeat(60));
+
+            exitCode = failCount == 0 ? 0 : 1;
+
+        } catch (Exception e) {
+            System.err.println("\nFatal error running pipeline: " + e.getMessage());
+            e.printStackTrace();
+            exitCode = 1;
+        } finally {
+            if (context != null) {
+                context.close();
+            }
+        }
+
+        System.exit(exitCode);
+    }
+}
diff --git a/src/main/java/com/ubs/tariffapp/services/PythonScraperService.java b/src/main/java/com/ubs/tariffapp/services/PythonScraperService.java
@@ -97,18 +97,29 @@ public boolean scrapeAndProcessCountryData(String countryCode) {
      */
     private String runWitsDataScraper(String countryCode, String mostRecentYear) {
         try {
-            // Build command with country code and year as arguments
+            // Get the absolute path to the script
+            Path scriptPath = Paths.get(pythonScriptPath).toAbsolutePath();
+            
+            if (!Files.exists(scriptPath)) {
+                logger.error("Python script not found at: {}", scriptPath);
+                return null;
+            }
+            
+            logger.info("Using Python script: {}", scriptPath);
+            
+            // Build command with absolute script path
+            // Add -u flag to run Python in unbuffered mode for real-time output
             ProcessBuilder processBuilder = new ProcessBuilder(
-                pythonExecutable, 
-                pythonScriptPath, 
+                pythonExecutable,
+                "-u",  // Unbuffered mode - see output in real-time!
+                scriptPath.toString(),
                 countryCode, 
                 mostRecentYear,
                 "--headless"
             );
             
             Map<String, String> env = processBuilder.environment();
             
-            // Use the configured credentials from application properties
             if (witsUsername != null && !witsUsername.isEmpty()) {
                 env.put("WITS_USERNAME", witsUsername);
                 logger.debug("WITS_USERNAME environment variable set for scraper from properties");
@@ -130,12 +141,7 @@ private String runWitsDataScraper(String countryCode, String mostRecentYear) {
                 env.put("WITS_API_KEY", witsApiKey);
             }
             
-            // Set working directory to script directory
-            Path scriptDir = Paths.get(pythonScriptPath).getParent();
-            if (scriptDir != null && Files.exists(scriptDir)) {
-                processBuilder.directory(scriptDir.toFile());
-                logger.debug("Working directory set to: {}", scriptDir);
-            }
+            // Don't set working directory - let Python script handle paths
             
             processBuilder.redirectErrorStream(true);
             Process process = processBuilder.start();
@@ -145,9 +151,11 @@ private String runWitsDataScraper(String countryCode, String mostRecentYear) {
                 String line;
                 while ((line = reader.readLine()) != null) {
                     output.append(line).append("\n");
-                    // Don't log credentials that might appear in output
+                    // Log output in real time, skipping lines that mention a password or credential
                     if (!line.toLowerCase().contains("password") && !line.toLowerCase().contains("credential")) {
-                        logger.debug("Web scraper output: {}", line);
+                        // Log to console AND logger
+                        System.out.println("Python: " + line);  // Console output
+                        logger.info("Python: {}", line);         // Logger output
                     }
                 }
             }
@@ -204,6 +212,16 @@ private boolean processScrapedFile(String fileName, String countryCode) {
                 String[] args = {fileName};
                 HSDataCleaner.main(args);
                 logger.info("HSDataCleaner processing completed for file: {}", fileName);
+                
+                // Delete the original scraped file after cleaning
+                Path originalFile = Paths.get("src/main/resources/data/test_data").resolve(fileName);
+                try {
+                    Files.delete(originalFile);
+                    logger.info("Deleted original scraped file: {}", fileName);
+                } catch (Exception e) {
+                    logger.warn("Could not delete original file {}: {}", fileName, e.getMessage());
+                }
+                
             } catch (Exception e) {
                 logger.error("Error running HSDataCleaner: {}", e.getMessage(), e);
                 return false;
diff --git a/src/main/java/com/ubs/tariffapp/utils/HSDataCleaner.java b/src/main/java/com/ubs/tariffapp/utils/HSDataCleaner.java
@@ -6,8 +6,9 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.math.BigDecimal;
-import java.math.RoundingMode;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -130,13 +131,21 @@ public static void main(String[] args) {
             System.out.println("No file argument provided, using default: " + inputFileName);
         }
 
-        // Read input from resources
-        InputStream inputStream = HSDataCleaner.class.getResourceAsStream("/data/test_data/" + inputFileName);
-        if (inputStream == null) {
+        // Read input from file system instead of classpath resources
+        Path inputPath = Paths.get("src/main/resources/data/test_data/" + inputFileName);
+        if (!Files.exists(inputPath)) {
             System.err.println("Input CSV file not found in resources folder: " + inputFileName);
             return;
         }
 
+        InputStream inputStream;
+        try {
+            inputStream = Files.newInputStream(inputPath);
+        } catch (IOException e) {
+            System.err.println("Error opening input file: " + e.getMessage());
+            return;
+        }
+
         String outputFileName = "clean_" + inputFileName;
         String outputFile = "src/main/resources/data/clean_data/" + outputFileName;
         
@@ -295,6 +304,16 @@ public static void main(String[] args) {
             System.out.println("Output saved to " + outputFile);
             System.out.println("Added columns: Industry, DutyType, StandardizedAVRate, SpecificDutyAmount, Currency, Unit, OriginalSpecificDuty");
 
+            System.out.println("Data cleaning completed successfully.");
+
+            // Delete the original file from test_data
+            try {
+                Files.delete(inputPath);
+                System.out.println("Deleted original file: " + inputPath);
+            } catch (Exception e) {
+                System.err.println("Warning: Could not delete original file: " + e.getMessage());
+            }
+
         } catch (IOException e) {
             e.printStackTrace();
             return;
diff --git a/src/main/java/com/ubs/tariffapp/utils/PipelineExecutor.java b/src/main/java/com/ubs/tariffapp/utils/PipelineExecutor.java
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties