Skip to content

Add historical eras based on year #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ env/
Docker/__pycache__

venv/

backend/4300_venv
*.pyc
__pycache__/

Expand All @@ -17,4 +17,8 @@ dist/
build/
*.egg-info/
helpers/*
json_template/
json_template/
.env

backend/data/scripts
backend/data/scripts/*
103 changes: 75 additions & 28 deletions backend/app.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,93 @@
import json
import os
from flask import Flask, render_template, request
import numpy as np
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from helpers.MySQLDatabaseHandler import MySQLDatabaseHandler
import pandas as pd

# ROOT_PATH for linking with all your files.
# Feel free to use a config.py or settings.py with a global export variable
os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..",os.curdir))

# Get the directory of the current script
current_directory = os.path.dirname(os.path.abspath(__file__))
from processor import WeightedTfidfProcessor
from filters import Filters

os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..", os.curdir))


def assign_era(year_str):
    """Classify a year value into a named historical era.

    Accepts ints or strings such as "2500BC", "500 AD", or "1492".
    BC years are mapped to negative numbers before bucketing.
    Returns "Unknown" when the value cannot be parsed as a year.
    """
    try:
        # Normalize once so "bc" / "Bc" / "BC" all parse identically
        # (the original replace('BC', ...) missed lowercase suffixes).
        text = str(year_str).strip().upper()
        if 'BC' in text:
            year_num = -int(text.replace('BC', '').strip())
        elif 'AD' in text:
            year_num = int(text.replace('AD', '').strip())
        else:
            year_num = int(text)
    except (ValueError, TypeError):
        # Narrow except: only parse failures mean "Unknown".
        return "Unknown"

    if year_num <= -3000:
        return "Prehistoric"
    elif year_num <= -1000:
        return "Bronze Age"
    elif year_num <= 0:
        return "Iron Age / Classical"
    elif year_num <= 500:
        return "Classical Antiquity"
    elif year_num <= 1500:
        return "Medieval"
    elif year_num <= 1800:
        return "Early Modern"
    elif year_num <= 1945:
        return "Colonial / Industrial"
    else:
        return "Contemporary"

# Specify the path to the JSON file relative to the current script

current_directory = os.path.dirname(os.path.abspath(__file__))
json_file_path = os.path.join(current_directory, 'init.json')
# CSV of historical events; its 'Year' column drives era assignment below.
csv_file_path = os.path.join(current_directory, 'data', 'final_data.csv')

historical_df = pd.read_csv(csv_file_path)
# Derive a coarse era label for every row from its Year value.
historical_df['era'] = historical_df['Year'].apply(assign_era)

# Module-level TF-IDF index, built once at import time.
# weight_factor=1 means no extra repetition/weighting of the name fields.
weight_processor = WeightedTfidfProcessor(
historical_df.to_dict('records'),
weight_factor=1
)


def tfidf_search(query, top_n=5):
    """Search the historical dataset via the shared TF-IDF processor.

    Thin wrapper over the module-level WeightedTfidfProcessor instance;
    returns its ranked result records.
    """
    hits = weight_processor.search(query, top_n=top_n)
    return hits

# Load the sample episode/review data used by the /episodes endpoint.
with open(json_file_path, 'r') as file:
    data = json.load(file)
episodes_df = pd.DataFrame(data['episodes'])
reviews_df = pd.DataFrame(data['reviews'])

app = Flask(__name__)
# Enable CORS so the separately-hosted frontend can call this API.
CORS(app)

# Sample search using json with pandas
def json_search(query):
    """Case-insensitive substring search over episode titles.

    Joins episodes with their reviews on 'id' and returns a JSON array
    string of records with 'title', 'descr', and 'imdb_rating'.
    """
    merged_df = pd.merge(episodes_df, reviews_df, on='id', how='inner')
    # regex=False: treat the query literally (special chars used to raise);
    # na=False: rows with missing titles simply don't match instead of erroring.
    mask = merged_df['title'].str.contains(query, case=False, na=False, regex=False)
    matches_filtered = merged_df.loc[mask, ['title', 'descr', 'imdb_rating']]
    return matches_filtered.to_json(orient='records')

@app.route("/")
def home():
    """Render the landing page, passing the Mapbox token to the template."""
    # Removed the stale `return render_template(..., title="sample html")`
    # diff remnant that made the lines below unreachable.
    # May be None when MAPBOX_ACCESS_TOKEN is unset — the template must cope.
    mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')

    return render_template('base.html', title="World Heritage Explorer", mapbox_token=mapbox_token)


@app.route("/historical-sites")
def historical_search():
    """Free-text search over historical sites, filtered to a year range.

    Query params: 'query' (required), 'minYear'/'maxYear' (optional,
    default "2500BC".."2012"). Returns a JSON list of matching records.
    """
    query = request.args.get("query", "")
    min_year = request.args.get("minYear", "2500BC")
    max_year = request.args.get("maxYear", "2012")

    # No query text means nothing to rank — return an empty result set.
    if not query:
        return jsonify([])

    hits = tfidf_search(query)
    year_filter = Filters(hits, min_year, max_year)
    return jsonify(year_filter.filter_by_year())

@app.route("/episodes")
def episodes_search():
    """Search episodes by title substring via the ?title= query param."""
    text = request.args.get("title")
    # Guard: without the param, get() returns None and json_search would
    # crash calling .lower() on it — return an empty JSON array instead.
    if not text:
        return "[]"
    return json_search(text)

# Start the dev server only when run directly (not when imported, e.g. by a
# WSGI server). The old unconditional `if 'DB_NAME' not in os.environ:`
# launch was diff residue and would start a second server at import time.
if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0", port=8080)
1,148 changes: 1,148 additions & 0 deletions backend/data/final_data.csv

Large diffs are not rendered by default.

1,148 changes: 1,148 additions & 0 deletions backend/data/historical_events.csv

Large diffs are not rendered by default.

1,148 changes: 1,148 additions & 0 deletions backend/data/historical_events_geocoded.csv

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions backend/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
import numpy as np

class Filters:
    """Filter TF-IDF search results to a year range.

    Year bounds and record years may be ints or strings, optionally
    suffixed "BC" (negated) or "AD" in any case; unparseable values
    become NaN and never match the range.
    """

    def __init__(self, data, min_year: str, max_year: str):
        self.data = data
        self.min_year = self.transform_years(min_year)
        self.max_year = self.transform_years(max_year)

    def transform_years(self, year):
        """Normalize a year value to an int (negative for BC) or np.nan."""
        if isinstance(year, str):
            # Case-insensitive suffix handling, consistent with assign_era
            # (the previous version only matched uppercase "BC" and treated
            # any "AD" year as unparseable).
            text = year.strip().upper()
            if "BC" in text:
                try:
                    return -int(text.replace("BC", "").strip())
                except ValueError:
                    return np.nan
            if "AD" in text:
                year = text.replace("AD", "").strip()
        try:
            return int(year)
        except (ValueError, TypeError):
            return np.nan

    def filter_by_year(self):
        """Return records whose record['row']['Year'] is within the bounds.

        Malformed records (non-dicts, missing 'row'/'Year', unparseable
        years) are skipped. If a bound itself is NaN, no record matches.
        """
        if not isinstance(self.data, list):
            print("Warning: Expected a list of records")
            return self.data

        filtered_records = []
        for record in self.data:
            if not (isinstance(record, dict) and 'row' in record):
                continue
            nested_row = record['row']
            if not (isinstance(nested_row, dict) and 'Year' in nested_row):
                continue
            year = self.transform_years(nested_row['Year'])
            if not np.isnan(year) and self.min_year <= year <= self.max_year:
                filtered_records.append(record)

        return filtered_records
94 changes: 94 additions & 0 deletions backend/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class WeightedTfidfProcessor:
    """TF-IDF index over historical-event records with field weighting.

    Fields listed in *weight_fields* are repeated within each document
    so their terms carry roughly *weight_factor* times normal weight.
    """

    def __init__(self, rows, weight_fields=None, weight_factor=2.0):
        self.rows = rows
        self.weight_fields = weight_fields or ['Name of Incident', 'description']
        self.weight_factor = weight_factor
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            max_features=5000,
            token_pattern=r'\b[a-zA-Z]{3,}\b'  # alphabetic tokens, length >= 3
        )

        self.corpus = self._prepare_weighted_corpus()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.corpus)
        self.feature_names = self.vectorizer.get_feature_names_out()

        # Human-readable label per document, e.g. "Battle of X (Somewhere)".
        self.doc_labels = [
            f"{row.get('Name of Incident', 'Unknown')} ({row.get('Place Name', 'Unknown')})"
            for row in self.rows
        ]

    def _prepare_weighted_corpus(self):
        """Build one text document per row, repeating weighted fields."""
        corpus = []
        # Loop-invariant: each weighted field gets (weight_factor - 1) extra
        # copies; clamp at 0 so weight_factor < 1 just means "no repetition".
        repetitions = max(int(self.weight_factor) - 1, 0)

        for row in self.rows:
            document_parts = [
                str(value)
                for value in row.values()
                if value and isinstance(value, (str, int, float))
            ]
            for field in self.weight_fields:
                if field in row and row[field]:
                    document_parts.extend([str(row[field])] * repetitions)
            corpus.append(" ".join(document_parts))

        return corpus

    def get_top_terms(self, n=10):
        """Return the *n* highest-scoring TF-IDF terms per document."""
        top_terms = []

        for i, label in enumerate(self.doc_labels):
            tfidf_scores = self.tfidf_matrix[i].toarray().flatten()
            sorted_indices = np.argsort(tfidf_scores)[::-1]

            document_terms = {}
            for idx in sorted_indices[:n]:
                if tfidf_scores[idx] > 0:
                    # Cast from np.float64 so the dict is JSON-serializable.
                    document_terms[self.feature_names[idx]] = float(tfidf_scores[idx])

            top_terms.append({
                'document': label,
                'top_terms': document_terms
            })

        return top_terms

    def get_document_similarity(self):
        """Return a pairwise cosine-similarity DataFrame of all documents."""
        from sklearn.metrics.pairwise import cosine_similarity

        similarity_matrix = cosine_similarity(self.tfidf_matrix)
        return pd.DataFrame(similarity_matrix, index=self.doc_labels, columns=self.doc_labels)

    def search(self, query, top_n=5):
        """Return up to *top_n* rows ranked by cosine similarity to *query*.

        Each result is {'document', 'score', 'row'}; documents with zero
        term overlap are dropped.
        """
        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        top_indices = similarities.argsort()[::-1][:top_n]

        results = []
        for idx in top_indices:
            score = similarities[idx]
            if score > 0:
                results.append({
                    'document': self.doc_labels[idx],
                    # float() cast: np.float64 is not serializable by
                    # Flask's jsonify, which the route feeds this into.
                    'score': float(score),
                    'row': self.rows[idx],
                })

        return results
90 changes: 64 additions & 26 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,64 @@
cffi==1.15.1
click==8.1.3
colorama==0.4.6
cryptography==39.0.2
Flask==2.2.2
Flask-Cors==3.0.10
gitdb==4.0.10
GitPython==3.1.30
greenlet>=2.0.2
gunicorn==20.1.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.2
numpy>=1.26.4
pandas>=2.2.1
protobuf==3.20.3
pycparser==2.21
PyMySQL==1.0.2
python-dateutil==2.9.0.post0
pytz==2024.1
six==1.16.0
smmap==5.0.0
SQLAlchemy==1.4.46
typing_extensions==4.5.0
tzdata==2024.1
Werkzeug==2.2.2
affine==2.4.0
annotated-types==0.7.0
anyio==4.8.0
attrs==25.1.0
beautifulsoup4==4.13.3
blinker==1.9.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
click-plugins==1.1.1
cligj==0.7.2
contextily==1.6.2
contourpy==1.3.1
cycler==0.12.1
distro==1.9.0
Flask==3.1.0
flask-cors==5.0.1
fonttools==4.56.0
geographiclib==2.0
geopandas==1.0.1
geopy==2.4.1
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
kiwisolver==1.4.8
MarkupSafe==3.0.2
matplotlib==3.10.1
mercantile==1.2.1
numpy==2.2.3
openai==1.66.3
packaging==24.2
pandas==2.2.3
pillow==11.1.0
pydantic==2.10.6
pydantic_core==2.27.2
pyogrio==0.10.0
pyparsing==3.2.1
pyproj==3.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2025.1
rasterio==1.4.3
requests==2.32.3
scikit-learn==1.6.1
scipy==1.15.2
seaborn==0.13.2
shapely==2.0.7
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
threadpoolctl==3.6.0
tqdm==4.67.1
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
Werkzeug==3.1.3
wikipedia==1.4.0
Wikipedia-API==0.8.1
xyzservices==2025.1.0
Loading