
Commit 8ab7241

Asap 225 add comparison to crawler (#318)
* Add a comparison option to the crawler and create some test coverage.
* Add script to make crawling and classification updates easier.
* Clean up and fix script for non-archives.
* Add fields and UI for crawl data.
* Clean up and add previous json option to crawl script.
* Further refine comparison logic and allow resuming scrape with json file.
* Add more robust date handling.
* Make sure last_crawl_date is available before formatting.
* Add crawl date to all rows.
* Make column order irrelevant in test.
* Make document_status optional, but set a default value.
* Get rid of bloated date helper.
* Fix up linting.
* Add lightweight test for import rake.
* General cleanup.
* Temporarily add 2025-09 crawl.
* Fix up linting.
* Fix more linting issues.
* Fix bad logic in default value setter.
* Update the tests.
* Fix tests with old status values.
* Add test for document status tags.
* Add latest crawl and remove intermediate version.
* Add a couple of small tweaks to docs.
* Add a staging credentials file.
* Add updated .gitignore.
1 parent c4845ca commit 8ab7241
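The crawler itself lives in python_components/crawler and its comparison code is not part of this diff, but the document statuses introduced below (New, Active, Removed) suggest the general idea: the current crawl's URL set is compared against a previous crawl's. A rough, hypothetical Ruby illustration of that kind of classification (not the crawler's actual implementation):

# Hypothetical illustration only; not the crawler's actual implementation.
def classify_crawl(previous_urls, current_urls)
  {
    "New"     => current_urls - previous_urls,   # present now, absent from the previous crawl
    "Active"  => current_urls & previous_urls,   # present in both crawls
    "Removed" => previous_urls - current_urls    # present before, gone from the current crawl
  }
end

classify_crawl(["a.pdf", "b.pdf"], ["b.pdf", "c.pdf"])
# => {"New"=>["c.pdf"], "Active"=>["b.pdf"], "Removed"=>["a.pdf"]}

In this commit the result of that comparison reaches Rails as a crawl_status column in the exported CSV, which bin/crawl and Site#process_csv_documents pass through to Document#document_status.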


25 files changed (+741, -252 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -157,3 +157,5 @@ python_components/*/*/.pytest_cache
 db/schema.rb
 
 cline_docs
+
+/config/credentials/staging.key

app/models/document.rb

Lines changed: 15 additions & 2 deletions
@@ -32,6 +32,11 @@ class Document < ApplicationRecord
 
   COMPLEXITIES = [SIMPLE_STATUS, COMPLEX_STATUS].freeze
 
+  DOCUMENT_STATUS_NEW = "New".freeze
+  DOCUMENT_STATUS_ACTIVE = "Active".freeze
+  DOCUMENT_STATUS_REMOVED = "Removed".freeze
+  DOCUMENT_STATUSES = [DOCUMENT_STATUS_NEW, DOCUMENT_STATUS_ACTIVE, DOCUMENT_STATUS_REMOVED].freeze
+
   belongs_to :site
 
   has_many :document_inferences
@@ -42,7 +47,7 @@ class Document < ApplicationRecord
 
   validates :file_name, presence: true
   validates :url, presence: true, format: {with: URI::DEFAULT_PARSER.make_regexp}
-  validates :document_status, presence: true, inclusion: {in: %w[discovered downloaded]}
+  validates :document_status, inclusion: {in: DOCUMENT_STATUSES, allow_blank: true, allow_nil: true}
   validates :document_category, inclusion: {in: CONTENT_TYPES}
   validates :accessibility_recommendation, inclusion: {in: -> { get_decision_types }}, presence: true
   validates :complexity, inclusion: {in: COMPLEXITIES}, allow_nil: true
@@ -284,6 +289,14 @@ def primary_source
     urls.is_a?(Array) ? urls.first : urls
   end
 
+  def get_crawl_status_display
+    if document_status == DOCUMENT_STATUS_NEW && last_crawl_date.present? && last_crawl_date.after?(1.week.ago)
+      DOCUMENT_STATUS_NEW
+    elsif document_status == DOCUMENT_STATUS_REMOVED
+      DOCUMENT_STATUS_REMOVED
+    end
+  end
+
   private
 
   def recursive_decode(url)
@@ -295,8 +308,8 @@ def recursive_decode(url)
   end
 
   def set_defaults
-    self.document_status = "discovered" unless document_status
     self.accessibility_recommendation = DEFAULT_DECISION unless accessibility_recommendation
+    self.document_status = DOCUMENT_STATUS_ACTIVE unless document_status.present?
   end
 
   def set_complexity
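
The `get_crawl_status_display` helper is what the index view uses to decide whether to render a badge: only documents that became New within the last week, or that are currently Removed, get one. A minimal sketch of the expected behavior, using hypothetical in-memory documents:

# Hypothetical documents built in memory to illustrate the badge rules above.
recent_new = Document.new(document_status: Document::DOCUMENT_STATUS_NEW, last_crawl_date: 2.days.ago)
stale_new  = Document.new(document_status: Document::DOCUMENT_STATUS_NEW, last_crawl_date: 3.weeks.ago)
removed    = Document.new(document_status: Document::DOCUMENT_STATUS_REMOVED)
active     = Document.new(document_status: Document::DOCUMENT_STATUS_ACTIVE)

recent_new.get_crawl_status_display  # => "New"     (crawled within the past week)
stale_new.get_crawl_status_display   # => nil       (the "New" badge expires after a week)
removed.get_crawl_status_display     # => "Removed" (shown regardless of crawl date)
active.get_crawl_status_display      # => nil       (no badge for ordinary active documents)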

app/models/site.rb

Lines changed: 36 additions & 5 deletions
@@ -170,7 +170,6 @@ def discover_documents!(document_data, collect = false)
       url = data[:url]
       modification_date = data[:modification_date]
 
-      # Find existing document - one query per document but minimal memory usage
       existing_document = documents.find_by(url: url)
 
       ActiveRecord::Base.transaction do
@@ -235,6 +234,10 @@ def process_csv_documents(csv_path)
         urls.empty? ? nil : urls
       end
 
+      if row["crawl_date"].present? && row["crawl_date"].is_a?(String)
+        row["crawl_date"] = Time.parse(row["crawl_date"]).to_i
+      end
+
       documents << {
         url: row["url"],
         file_name: row["file_name"],
@@ -251,7 +254,9 @@ def process_csv_documents(csv_path)
         predicted_category_confidence: row["predicted_category_confidence"],
         number_of_pages: row["number_of_pages"]&.to_i,
         number_of_tables: row["number_of_tables"]&.to_i,
-        number_of_images: row["number_of_images"]&.to_i
+        number_of_images: row["number_of_images"]&.to_i,
+        crawl_status: row["crawl_status"].present? ? row["crawl_status"].capitalize : "",
+        crawl_date: row["crawl_date"]
       }
     rescue URI::InvalidURIError => e
       puts "Skipping invalid URL: #{row["url"]}"
@@ -353,12 +358,12 @@ def attributes_from(data)
      document_category: data[:predicted_category] || data[:document_category],
      document_category_confidence: data[:predicted_category_confidence] || data[:document_category_confidence],
      url: data[:url],
-     modification_date: data[:modification_date],
+     modification_date: clean_date(data[:modification_date]),
      file_size: data[:file_size],
      author: clean_string(data[:author]),
      subject: clean_string(data[:subject]),
      keywords: clean_string(data[:keywords]),
-     creation_date: data[:creation_date],
+     creation_date: clean_date(data[:creation_date]),
      producer: clean_string(data[:producer]),
      pdf_version: clean_string(data[:pdf_version]),
      source: if data[:source].nil?
@@ -369,7 +374,8 @@ def attributes_from(data)
      number_of_pages: data[:number_of_pages],
      number_of_tables: data[:number_of_tables],
      number_of_images: data[:number_of_images],
-     document_status: "discovered"
+     document_status: data[:crawl_status],
+     last_crawl_date: clean_date(data[:crawl_date])
    }
  end
 
@@ -378,6 +384,31 @@ def clean_string(str)
    str.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "").strip
  end
 
+  def clean_date(date)
+    if date.nil?
+      return nil
+    end
+    if date.is_a?(String)
+      return nil if date.empty?
+      Time.parse(date)
+    end
+    if date.is_a?(Integer)
+      case date
+      when 0..9_999_999_999
+        return Time.at(date)
+      when 10_000_000_000..9_999_999_999_999
+        return Time.at(date / 1000)
+      when 10_000_000_000_000..9_999_999_999_999_999
+        return Time.at(date / 1_000_000)
+      when 10_000_000_000_000_000..Float::INFINITY
+        return Time.at(date / 1_000_000_000)
+      else
+        return nil
+      end
+    end
+    date
+  end
+
  def ensure_safe_url
    return if primary_url.blank?
 
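`clean_date` exists because crawl exports and PDF metadata can carry timestamps as strings or as integer epochs of varying precision; the case statement buckets integers by magnitude. A minimal sketch of the integer normalization, using hypothetical values for the same instant (2021-01-01 00:00:00 UTC) and assuming `clean_date` is reachable as the private helper shown above:

# Hypothetical epoch values for 2021-01-01 00:00:00 UTC at different precisions.
seconds      = 1_609_459_200                 # up to 10 digits -> Time.at(value)
milliseconds = 1_609_459_200_000             # 11-13 digits    -> Time.at(value / 1000)
microseconds = 1_609_459_200_000_000         # 14-16 digits    -> Time.at(value / 1_000_000)
nanoseconds  = 1_609_459_200_000_000_000     # 17+ digits      -> Time.at(value / 1_000_000_000)

[seconds, milliseconds, microseconds, nanoseconds].map { |v| site.send(:clean_date, v) }
# => four equal Time objects, assuming `site` is an existing Site instance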

app/views/documents/index.html.erb

Lines changed: 12 additions & 0 deletions
@@ -96,6 +96,18 @@
             <i class="fas fa-file-pdf text-sm"></i> <%= document.file_name&.truncate(80) %>
             <span class="sr-only">Show Document Modal</span>
           </button>
+          <% crawl_status = document.get_crawl_status_display %>
+          <% if crawl_status.present? %>
+            <% if crawl_status == Document::DOCUMENT_STATUS_NEW %>
+              <div class="tooltip tooltip-top tooltip-primary" data-tip="Recently uploaded PDF - <%= document.last_crawl_date.present? ? document.last_crawl_date.strftime("%Y-%m-%d") : "Date Unknown" %>">
+                <div class="badge badge-sm badge-primary"><%= crawl_status %></div>
+              </div>
+            <% else %>
+              <div class="tooltip tooltip-top tooltip-neutral" data-tip="PDF removed - <%= document.last_crawl_date.present? ? document.last_crawl_date.strftime("%Y-%m-%d") : "Date Unknown" %>">
+                <div class="badge badge-sm badge-neutral"><%= crawl_status %></div>
+              </div>
+            <% end %>
+          <% end %>
         </div>
         <% source = document_source(document.primary_source) %>
         <div class="text-gray-400">

bin/crawl

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# Script to crawl a website and extract PDF information
+# Usage: bin/crawl <site_url> <output_dir> [previous_crawl_directory] [previous_link_json]
+# Site site_url must exist in python_components/crawler/config.json.
+# previous_crawl_directory is the path to a previous version of the crawl to compare against.
+# previous_link_json is a list of links and sources produced by the first phase of the crawler.
+# Useful if metadata fetching failed or to resume an otherwise halted process.
+
+# Check if site_url parameter is provided
+if [ $# -eq 0 ]; then
+  echo "Error: site_url parameter is required"
+  echo "Usage: bin/crawl <site_url> <output_dir> [previous_crawl_directory] [previous_link_json]"
+  echo "Example: bin/crawl https://georgia.gov /db/seeds/site_documents_2025_09"
+  echo "Example: bin/crawl https://georgia.gov /db/seeds/site_documents_2025_09 /path/to/previous_crawl_directory/"
+  echo "Example: bin/crawl https://georgia.gov /db/seeds/site_documents_2025_09 /path/to/previous_crawl_directory.zip"
+  echo "Example: bin/crawl https://georgia.gov /db/seeds/site_documents_2025_09 /path/to/previous_crawl_directory.zip /georgia_files.json"
+  exit 1
+fi
+
+SITE_URL="$1"
+OUTPUT_DIR="$2"
+PREVIOUS_CRAWL="$3"
+PREVIOUS_LINK_JSON="$4"
+
+# Find the project root directory by looking for the bin directory
+# This allows the script to work from any subdirectory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+docker build -t asap_pdf:crawler "$PROJECT_ROOT/python_components/crawler/."
+docker build -t asap_pdf:classifier "$PROJECT_ROOT/python_components/classifier/."
+
+CONFIG_FILE="$PROJECT_ROOT/python_components/crawler/config.json"
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: config.json not found at $CONFIG_FILE"
+  exit 1
+fi
+
+OUTPUT_FILE=$(jq -r --arg site_url "$SITE_URL" '
+  if has($site_url) then
+    .[$site_url].output_file
+  else
+    "output.csv"
+  end
+' "$CONFIG_FILE" 2>/dev/null || echo "output.csv")
+
+if [ -n "$PREVIOUS_CRAWL" ]; then
+  if [ ! -e "$PREVIOUS_CRAWL" ]; then
+    echo "Error: Previous crawl path '$PREVIOUS_CRAWL' does not exist"
+    exit 1
+  fi
+  CRAWLER_TMP_DIR="$PROJECT_ROOT/crawler_tmp"
+  PREVIOUS_CRAWL_DIR="$CRAWLER_TMP_DIR/previous_crawl"
+  mkdir -p $PREVIOUS_CRAWL_DIR
+  if [ -f "$PREVIOUS_CRAWL" ] && [[ "$PREVIOUS_CRAWL" == *.zip ]]; then
+    if command -v unzip >/dev/null 2>&1; then
+      unzip -j -q "$PREVIOUS_CRAWL" -d "$PREVIOUS_CRAWL_DIR"
+      if [ $? -eq 0 ]; then
+        echo "Successfully extracted previous crawl archive"
+      else
+        echo "Error: Failed to extract zip archive"
+        rm -rf "$PREVIOUS_CRAWL_DIR"
+        exit 1
+      fi
+    else
+      echo "Error: unzip command not found. Please install unzip to extract archives"
+      rm -rf "$PREVIOUS_CRAWL_DIR"
+      exit 1
+    fi
+  else
+    cp -r "$PREVIOUS_CRAWL"/* "$PREVIOUS_CRAWL_DIR"
+  fi
+  COMPARISON_FLAG="--comparison_crawl=/data/previous_crawl/$OUTPUT_FILE"
+fi
+
+PREVIOUS_JSON_LINK_FLAG=""
+if [ -n "$PREVIOUS_LINK_JSON" ]; then
+  if [ ! -f "$PREVIOUS_LINK_JSON" ]; then
+    echo "Previous JSON crawl file was specified, but does not exist."
+    exit 1
+  fi
+  mkdir -p "$CRAWLER_TMP_DIR/previous_json_links"
+  mv $PREVIOUS_LINK_JSON "$CRAWLER_TMP_DIR/previous_json_links"
+  PREVIOUS_JSON_LINK_FLAG="--crawled_links_json=/data/previous_json_links/$(basename $PREVIOUS_LINK_JSON)"
+fi
+
+TMP_OUTPUT="$CRAWLER_TMP_DIR/output"
+mkdir -p $TMP_OUTPUT
+mkdir -p $OUTPUT_DIR
+echo "$OUTPUT_DIR"
+
+set -x
+
+crawler_command=(
+  docker run --rm
+  -v "$PROJECT_ROOT/python_components/crawler:/workspace"
+  -v "$CRAWLER_TMP_DIR:/data"
+  asap_pdf:crawler
+  python /workspace/crawler.py "$SITE_URL" "/data/output/$OUTPUT_FILE"
+)
+
+if [ -n "$COMPARISON_FLAG" ]; then
+  crawler_command+=("$COMPARISON_FLAG")
+fi
+
+if [ -n "$PREVIOUS_JSON_LINK_FLAG" ]; then
+  crawler_command+=("$PREVIOUS_JSON_LINK_FLAG")
+fi
+
+"${crawler_command[@]}"
+
+set +x
+
+mv "$TMP_OUTPUT/$OUTPUT_FILE" "$TMP_OUTPUT/$OUTPUT_FILE-crawled"
+
+set -x
+
+docker run --rm \
+  -v "$PROJECT_ROOT/python_components/classifier:/workspace" \
+  -v "$TMP_OUTPUT:/output" \
+  asap_pdf:classifier \
+  python /workspace/classifier.py "/output/$OUTPUT_FILE-crawled" "/output/$OUTPUT_FILE"
+
+set +x
+
+mv "$TMP_OUTPUT/$OUTPUT_FILE" "$OUTPUT_DIR"
+
+rm -rf "$CRAWLER_TMP_DIR"

config/credentials/staging.yml.enc

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+7JOknLbj0E2yC0DwuXhJ2xQSnyyfeOmvwoUw4lRgNFBmPzMvfIXGGqF5TKB9wl6bK8FhA5meY8yTFCTIPFqvKq1WMnoRmcJqDa1wrhujvqrS9R4wExtYlAP7Q/bJyQVWK9D/VTo9gsUW9NeoeKQblqSEpE1PHw7X8w9gHztEygGxPAfYZJFfoGHQChzfMl9ac5sdCTEPOh/JuVJjBY4/dEJv75BEajlY6kv0zPn68frNnS8v/aJ6JMdp6UAbXjuzRYRD3E5rGWqQGxtCR1DQKL0RTFf3PWNemZainDfi7phDwZF2KaSiE2IlWks76fPYwNclxcW55o7FNm8MvpaWu3M4tIVJ4zwSNuq62j44gyUT4XYvUXyxkDf+ESr8Kw2bQrfN4CZMmaEWAw0cxHbj2Je0RTVhhTeu3aUX5qn1+0HBQdJ/gyWiskRFOFybfKb9QUgOSPWobz4hDfZnWqS8BHP0J0sKx2gzUY2svgjbPiK9WpdsWhtK1hIo--mlbA8M5tmXoKuCSX--xUIvSR9NnKJp1ht2UILLUw==

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+class RemoveStatusColumn < ActiveRecord::Migration[8.0]
+  def up
+    remove_column :documents, :status
+  end
+
+  def down
+    add_column :documents, :status, :string
+  end
+end

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+class AddLastCrawlDate < ActiveRecord::Migration[8.0]
+  def up
+    add_column :documents, :last_crawl_date, :datetime
+    change_column_default :documents, :document_status, from: nil, to: Document::DOCUMENT_STATUS_ACTIVE
+  end
+
+  def down
+    remove_column :documents, :last_crawl_date
+    change_column_default :documents, :document_status, from: Document::DOCUMENT_STATUS_ACTIVE, to: nil
+  end
+end
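
Combined with the model's `set_defaults` callback, these migrations mean a newly created document falls back to Active and only gets a `last_crawl_date` when a crawl import supplies one. A minimal sketch of the expected behavior, assuming `set_defaults` runs as a validation/save callback as the model diff above implies (attribute values are hypothetical):

# Hypothetical document; only the fields touched by these migrations are shown.
doc = Document.new(file_name: "budget_2025.pdf",
                   url: "https://example.georgia.gov/budget_2025.pdf")
doc.valid?            # assumed to trigger set_defaults
doc.document_status   # => "Active" (Document::DOCUMENT_STATUS_ACTIVE, also the new column default)
doc.last_crawl_date   # => nil until an imported row provides a crawl_date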
Binary file not shown (1.45 MB).

docs/windows_localdev.md

Lines changed: 4 additions & 2 deletions
@@ -25,7 +25,7 @@ We've tested two approaches for Windows development:
 
 **Prerequisites:**
 - Clone the repository to your WSL home directory (this avoids file permission issues)
-- Install Yarn: `npm install yarn -y`
+- Install Yarn: `npm install yarn -g`
 
 **Configuration Steps:**
 
@@ -36,7 +36,9 @@ We've tested two approaches for Windows development:
    ```
 
 2. **Update database configuration:**
-   - Make sure Postgres credentials in `config/database.yml` match your Postgres setup
+   - If your Postgres service was set up to allow "trust" authentication for local connections, you may not need to change anything.
+   - If you created a user with a password during database setup, add a username and password entry to the development section of `config/database.yml`. See the `staging` section for an example.
+
 
 3. **Install dependencies and setup database:**
    ```bash
