Skip to content

Add historical eras based on year #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ env/
Docker/__pycache__

venv/

backend/4300_venv
*.pyc
__pycache__/

Expand All @@ -17,4 +17,8 @@ dist/
build/
*.egg-info/
helpers/*
json_template/
json_template/
.env

backend/data/scripts
backend/data/scripts/*
103 changes: 75 additions & 28 deletions backend/app.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,93 @@
import json
import os
from flask import Flask, render_template, request
import numpy as np
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from helpers.MySQLDatabaseHandler import MySQLDatabaseHandler
import pandas as pd

# ROOT_PATH for linking with all your files.
# Feel free to use a config.py or settings.py with a global export variable
os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..",os.curdir))

# Get the directory of the current script
current_directory = os.path.dirname(os.path.abspath(__file__))
from processor import WeightedTfidfProcessor
from filters import Filters

os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..", os.curdir))


def assign_era(year_str):
    """Classify a year value into a named historical era.

    Accepts ints or strings such as "2500BC", "500 AD", or "1492".
    BC years are mapped to negative numbers before bucketing.
    Returns "Unknown" when the value cannot be parsed as a year.
    """
    try:
        # Normalize once so "bc" / "Bc" / "BC" all parse identically
        # (the original replace('BC', ...) missed lowercase suffixes).
        text = str(year_str).strip().upper()
        if 'BC' in text:
            year_num = -int(text.replace('BC', '').strip())
        elif 'AD' in text:
            year_num = int(text.replace('AD', '').strip())
        else:
            year_num = int(text)
    except (ValueError, TypeError):
        # Narrow except: only parse failures mean "Unknown".
        return "Unknown"

    if year_num <= -3000:
        return "Prehistoric"
    elif year_num <= -1000:
        return "Bronze Age"
    elif year_num <= 0:
        return "Iron Age / Classical"
    elif year_num <= 500:
        return "Classical Antiquity"
    elif year_num <= 1500:
        return "Medieval"
    elif year_num <= 1800:
        return "Early Modern"
    elif year_num <= 1945:
        return "Colonial / Industrial"
    else:
        return "Contemporary"

# Specify the path to the JSON file relative to the current script

current_directory = os.path.dirname(os.path.abspath(__file__))
json_file_path = os.path.join(current_directory, 'init.json')
# CSV of historical events; its 'Year' column drives era assignment below.
csv_file_path = os.path.join(current_directory, 'data', 'final_data.csv')

historical_df = pd.read_csv(csv_file_path)
# Derive a coarse era label for every row from its Year value.
historical_df['era'] = historical_df['Year'].apply(assign_era)

# Module-level TF-IDF index, built once at import time.
# weight_factor=1 means no extra repetition/weighting of the name fields.
weight_processor = WeightedTfidfProcessor(
historical_df.to_dict('records'),
weight_factor=1
)


def tfidf_search(query, top_n=5):
    """Search the historical dataset via the shared TF-IDF processor.

    Thin wrapper over the module-level WeightedTfidfProcessor instance;
    returns its ranked result records.
    """
    hits = weight_processor.search(query, top_n=top_n)
    return hits

# Load the sample episode/review data used by the /episodes endpoint.
with open(json_file_path, 'r') as file:
    data = json.load(file)
episodes_df = pd.DataFrame(data['episodes'])
reviews_df = pd.DataFrame(data['reviews'])

app = Flask(__name__)
# Enable CORS so the separately-hosted frontend can call this API.
CORS(app)

# Sample search using json with pandas
def json_search(query):
    """Case-insensitive substring search over episode titles.

    Joins episodes with their reviews on 'id' and returns a JSON array
    string of records with 'title', 'descr', and 'imdb_rating'.
    """
    merged_df = pd.merge(episodes_df, reviews_df, on='id', how='inner')
    # regex=False: treat the query literally (special chars used to raise);
    # na=False: rows with missing titles simply don't match instead of erroring.
    mask = merged_df['title'].str.contains(query, case=False, na=False, regex=False)
    matches_filtered = merged_df.loc[mask, ['title', 'descr', 'imdb_rating']]
    return matches_filtered.to_json(orient='records')

@app.route("/")
def home():
    """Render the landing page, passing the Mapbox token to the template."""
    # Removed the stale `return render_template(..., title="sample html")`
    # diff remnant that made the lines below unreachable.
    # May be None when MAPBOX_ACCESS_TOKEN is unset — the template must cope.
    mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')

    return render_template('base.html', title="World Heritage Explorer", mapbox_token=mapbox_token)


@app.route("/historical-sites")
def historical_search():
    """Free-text search over historical sites, filtered to a year range.

    Query params: 'query' (required), 'minYear'/'maxYear' (optional,
    default "2500BC".."2012"). Returns a JSON list of matching records.
    """
    query = request.args.get("query", "")
    min_year = request.args.get("minYear", "2500BC")
    max_year = request.args.get("maxYear", "2012")

    # No query text means nothing to rank — return an empty result set.
    if not query:
        return jsonify([])

    hits = tfidf_search(query)
    year_filter = Filters(hits, min_year, max_year)
    return jsonify(year_filter.filter_by_year())

@app.route("/episodes")
def episodes_search():
    """Search episodes by title substring via the ?title= query param."""
    text = request.args.get("title")
    # Guard: without the param, get() returns None and json_search would
    # crash calling .lower() on it — return an empty JSON array instead.
    if not text:
        return "[]"
    return json_search(text)

# Start the dev server only when run directly (not when imported, e.g. by a
# WSGI server). The old unconditional `if 'DB_NAME' not in os.environ:`
# launch was diff residue and would start a second server at import time.
if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0", port=8080)
1,148 changes: 1,148 additions & 0 deletions backend/data/final_data.csv

Large diffs are not rendered by default.

1,148 changes: 1,148 additions & 0 deletions backend/data/historical_events.csv

Large diffs are not rendered by default.

1,148 changes: 1,148 additions & 0 deletions backend/data/historical_events_geocoded.csv

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions backend/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
import numpy as np

class Filters:
    """Filter TF-IDF search results to a year range.

    Year bounds and record years may be ints or strings, optionally
    suffixed "BC" (negated) or "AD" in any case; unparseable values
    become NaN and never match the range.
    """

    def __init__(self, data, min_year: str, max_year: str):
        self.data = data
        self.min_year = self.transform_years(min_year)
        self.max_year = self.transform_years(max_year)

    def transform_years(self, year):
        """Normalize a year value to an int (negative for BC) or np.nan."""
        if isinstance(year, str):
            # Case-insensitive suffix handling, consistent with assign_era
            # (the previous version only matched uppercase "BC" and treated
            # any "AD" year as unparseable).
            text = year.strip().upper()
            if "BC" in text:
                try:
                    return -int(text.replace("BC", "").strip())
                except ValueError:
                    return np.nan
            if "AD" in text:
                year = text.replace("AD", "").strip()
        try:
            return int(year)
        except (ValueError, TypeError):
            return np.nan

    def filter_by_year(self):
        """Return records whose record['row']['Year'] is within the bounds.

        Malformed records (non-dicts, missing 'row'/'Year', unparseable
        years) are skipped. If a bound itself is NaN, no record matches.
        """
        if not isinstance(self.data, list):
            print("Warning: Expected a list of records")
            return self.data

        filtered_records = []
        for record in self.data:
            if not (isinstance(record, dict) and 'row' in record):
                continue
            nested_row = record['row']
            if not (isinstance(nested_row, dict) and 'Year' in nested_row):
                continue
            year = self.transform_years(nested_row['Year'])
            if not np.isnan(year) and self.min_year <= year <= self.max_year:
                filtered_records.append(record)

        return filtered_records
94 changes: 94 additions & 0 deletions backend/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class WeightedTfidfProcessor:
    """TF-IDF index over historical-event records with field weighting.

    Fields listed in *weight_fields* are repeated within each document
    so their terms carry roughly *weight_factor* times normal weight.
    """

    def __init__(self, rows, weight_fields=None, weight_factor=2.0):
        self.rows = rows
        self.weight_fields = weight_fields or ['Name of Incident', 'description']
        self.weight_factor = weight_factor
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            max_features=5000,
            token_pattern=r'\b[a-zA-Z]{3,}\b'  # alphabetic tokens, length >= 3
        )

        self.corpus = self._prepare_weighted_corpus()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.corpus)
        self.feature_names = self.vectorizer.get_feature_names_out()

        # Human-readable label per document, e.g. "Battle of X (Somewhere)".
        self.doc_labels = [
            f"{row.get('Name of Incident', 'Unknown')} ({row.get('Place Name', 'Unknown')})"
            for row in self.rows
        ]

    def _prepare_weighted_corpus(self):
        """Build one text document per row, repeating weighted fields."""
        corpus = []
        # Loop-invariant: each weighted field gets (weight_factor - 1) extra
        # copies; clamp at 0 so weight_factor < 1 just means "no repetition".
        repetitions = max(int(self.weight_factor) - 1, 0)

        for row in self.rows:
            document_parts = [
                str(value)
                for value in row.values()
                if value and isinstance(value, (str, int, float))
            ]
            for field in self.weight_fields:
                if field in row and row[field]:
                    document_parts.extend([str(row[field])] * repetitions)
            corpus.append(" ".join(document_parts))

        return corpus

    def get_top_terms(self, n=10):
        """Return the *n* highest-scoring TF-IDF terms per document."""
        top_terms = []

        for i, label in enumerate(self.doc_labels):
            tfidf_scores = self.tfidf_matrix[i].toarray().flatten()
            sorted_indices = np.argsort(tfidf_scores)[::-1]

            document_terms = {}
            for idx in sorted_indices[:n]:
                if tfidf_scores[idx] > 0:
                    # Cast from np.float64 so the dict is JSON-serializable.
                    document_terms[self.feature_names[idx]] = float(tfidf_scores[idx])

            top_terms.append({
                'document': label,
                'top_terms': document_terms
            })

        return top_terms

    def get_document_similarity(self):
        """Return a pairwise cosine-similarity DataFrame of all documents."""
        from sklearn.metrics.pairwise import cosine_similarity

        similarity_matrix = cosine_similarity(self.tfidf_matrix)
        return pd.DataFrame(similarity_matrix, index=self.doc_labels, columns=self.doc_labels)

    def search(self, query, top_n=5):
        """Return up to *top_n* rows ranked by cosine similarity to *query*.

        Each result is {'document', 'score', 'row'}; documents with zero
        term overlap are dropped.
        """
        from sklearn.metrics.pairwise import cosine_similarity

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        top_indices = similarities.argsort()[::-1][:top_n]

        results = []
        for idx in top_indices:
            score = similarities[idx]
            if score > 0:
                results.append({
                    'document': self.doc_labels[idx],
                    # float() cast: np.float64 is not serializable by
                    # Flask's jsonify, which the route feeds this into.
                    'score': float(score),
                    'row': self.rows[idx],
                })

        return results
90 changes: 64 additions & 26 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,64 @@
cffi==1.15.1
click==8.1.3
colorama==0.4.6
cryptography==39.0.2
Flask==2.2.2
Flask-Cors==3.0.10
gitdb==4.0.10
GitPython==3.1.30
greenlet>=2.0.2
gunicorn==20.1.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.2
numpy>=1.26.4
pandas>=2.2.1
protobuf==3.20.3
pycparser==2.21
PyMySQL==1.0.2
python-dateutil==2.9.0.post0
pytz==2024.1
six==1.16.0
smmap==5.0.0
SQLAlchemy==1.4.46
typing_extensions==4.5.0
tzdata==2024.1
Werkzeug==2.2.2
affine==2.4.0
annotated-types==0.7.0
anyio==4.8.0
attrs==25.1.0
beautifulsoup4==4.13.3
blinker==1.9.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
click-plugins==1.1.1
cligj==0.7.2
contextily==1.6.2
contourpy==1.3.1
cycler==0.12.1
distro==1.9.0
Flask==3.1.0
flask-cors==5.0.1
fonttools==4.56.0
geographiclib==2.0
geopandas==1.0.1
geopy==2.4.1
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
kiwisolver==1.4.8
MarkupSafe==3.0.2
matplotlib==3.10.1
mercantile==1.2.1
numpy==2.2.3
openai==1.66.3
packaging==24.2
pandas==2.2.3
pillow==11.1.0
pydantic==2.10.6
pydantic_core==2.27.2
pyogrio==0.10.0
pyparsing==3.2.1
pyproj==3.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2025.1
rasterio==1.4.3
requests==2.32.3
scikit-learn==1.6.1
scipy==1.15.2
seaborn==0.13.2
shapely==2.0.7
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
threadpoolctl==3.6.0
tqdm==4.67.1
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
Werkzeug==3.1.3
wikipedia==1.4.0
Wikipedia-API==0.8.1
xyzservices==2025.1.0
Loading