
Commit 985f334

Merge pull request #153 from brave/dep-updates-and-logging
Dep updates, logging, Makefile, local dev updates
2 parents 4ee6d36 + 4e7a8cc commit 985f334

7 files changed: +124 −15 lines

.gitignore (+5)

@@ -11,3 +11,8 @@ __pycache__/
 /sources.csv.json
 .pytest_cache/*
 .idea/
+articles_history.en_US.csv
+source_similarity_t10.en_US.json
+source_similarity_t10_hr.en_US.json
+feed.en_US.json
+sources.en_US.json

Makefile (+2)

@@ -0,0 +1,2 @@
+create-local-env:
+	./local-env.sh

README.md (+25 −7)

@@ -16,17 +16,35 @@ pip install -r requirements.txt
 - `paraphrase-multilingual-MiniLM-L12-v2` for non-english language sources.
 Once all source embeddings are generated, a pairwise source similarity matrix is produced.

+
+## Description
+There are two jobs involved in generating source suggestions; both can be listed in EKS under `source-suggestions-prod`.
+
+- **feed-accumulator** runs hourly. It fetches the feed.json for each locale, accumulates them into a CSV file, and writes the result back to S3. The output is available at https://brave-today-cdn.brave.com/source-suggestions/articles_history.en_US.csv. The articles_history file is only used by the backend job source-sim-matrix; the client does not use it.
+
+- **source-sim-matrix** runs twice a week. It pulls the articles_history CSV and the publishers JSON from S3, performs clustering on the article text, and produces the source-suggestions JSON for each locale:
+  - https://brave-today-cdn.brave.com/source-suggestions/source_similarity_t10.en_US.json
+  - https://brave-today-cdn.brave.com/source-suggestions/source_similarity_t10_hr.en_US.json
+
+Non-English locales use a multilingual clustering model. The browser uses this file to determine which publishers to show in the suggested-publisher cards in the feed; a suggestion card appears roughly every 7-8 cards.
+
 ## Running locally
 To collect and accumulate article history:
+
+Run this to download the files needed to run the scripts locally:
+```sh
+make create-local-env
 ```
-export NO_UPLOAD=1
-export NO_DOWNLOAD=1
-python source-feed-accumulator.py
+
+```sh
+NO_UPLOAD=1 NO_DOWNLOAD=1 python source-feed-accumulator.py
 ```

 To compute source embeddings and produce the source similarity matrix:
-```
-export NO_UPLOAD=1
-export NO_DOWNLOAD=1
-python sources-similarity-matrix.py
+```sh
+NO_UPLOAD=1 NO_DOWNLOAD=1 python source-similarity-matrix.py
 ```
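For orientation, here is a rough sketch, not the repo's actual code, of how per-source embeddings could be turned into the pairwise similarity matrix and top-10 suggestion lists that the README describes. The publisher ids and vectors are made up, and 384 is simply the MiniLM-L12-v2 embedding size; the real job also handles clustering and per-locale output, which this sketch omits.

```python
# Hypothetical sketch: build a pairwise cosine-similarity matrix from
# per-source embeddings and keep the 10 most similar sources per publisher.
# Publisher ids and vectors are invented; this is not the repo's implementation.
import numpy as np
from sentence_transformers import util

source_embeddings = {
    "publisher_a": np.random.rand(384),  # 384 = MiniLM-L12-v2 embedding size
    "publisher_b": np.random.rand(384),
    "publisher_c": np.random.rand(384),
}

ids = list(source_embeddings)
matrix = util.cos_sim(
    np.stack([source_embeddings[i] for i in ids]),
    np.stack([source_embeddings[i] for i in ids]),
)  # shape: (num_sources, num_sources)

top_suggestions = {}
for row, source_id in enumerate(ids):
    ranked = sorted(
        ((other, float(matrix[row][col])) for col, other in enumerate(ids) if other != source_id),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_suggestions[source_id] = ranked[:10]  # 10 most similar sources
```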

embeddings.py (+35 −2)

@@ -1,6 +1,7 @@
 import numpy as np
 from sentence_transformers import util
 from structlog import get_logger
+import time  # Add import for timing

 import config

@@ -17,15 +18,47 @@ def compute_source_similarity(source_1, source_2, function='cosine'):


 def get_source_representation_from_titles(titles, model):
-    if len(titles) < config.MINIMUM_ARTICLE_HISTORY_SIZE:
+    num_titles = len(titles)
+    logger.info("get_source_representation_from_titles called", num_titles=num_titles)
+
+    if num_titles < config.MINIMUM_ARTICLE_HISTORY_SIZE:
+        logger.warn(
+            "Not enough titles for source representation",
+            num_titles=num_titles,
+            min_required=config.MINIMUM_ARTICLE_HISTORY_SIZE
+        )
         return np.zeros((1, EMBEDDING_DIMENSIONALITY))

-    return model.encode(titles).mean(axis=0)
+    start_time = time.time()
+    embeddings = model.encode(titles)
+    end_time = time.time()
+    logger.info(
+        "Model encoding finished",
+        num_titles=num_titles,
+        duration_sec=round(end_time - start_time, 3)
+    )
+
+    return embeddings.mean(axis=0)


 def compute_source_representation_from_articles(articles_df, publisher_id, model):
+    logger.info(
+        "compute_source_representation_from_articles called",
+        publisher_id=publisher_id,
+        dataframe_shape=articles_df.shape
+    )
+
+    start_time = time.time()
     publisher_bucket_df = articles_df[articles_df.publisher_id == publisher_id]
+    end_time = time.time()
+    logger.info(
+        "DataFrame filtering finished",
+        publisher_id=publisher_id,
+        duration_sec=round(end_time - start_time, 3),
+        filtered_shape=publisher_bucket_df.shape
+    )

     titles = [
         title for title in publisher_bucket_df.title.to_numpy() if title is not None]
+    # Pass the model to the helper function for encoding
     return get_source_representation_from_titles(titles, model)
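A minimal, hypothetical usage sketch of the two helpers touched above, assuming it is run from the repo root and that the title lists are long enough to clear `config.MINIMUM_ARTICLE_HISTORY_SIZE`; the model name comes from the README and the titles are invented:

```python
# Hypothetical usage of the instrumented helpers; titles are made up.
from sentence_transformers import SentenceTransformer

from embeddings import (
    compute_source_similarity,
    get_source_representation_from_titles,
)

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

titles_a = ["Browser update released", "Privacy feature announced", "New search index"]
titles_b = ["Quarterly earnings report", "Stock market summary", "Merger talks continue"]

# Each call now logs the title count and, when encoding runs, the encoding
# duration added in this commit. If a source has fewer titles than
# config.MINIMUM_ARTICLE_HISTORY_SIZE, a warning is logged and a zero
# vector is returned instead.
repr_a = get_source_representation_from_titles(titles_a, model)
repr_b = get_source_representation_from_titles(titles_b, model)

# Cosine similarity between the two mean-title representations.
print(compute_source_similarity(repr_a, repr_b))
```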

local-env.sh (+52)

@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Remove existing virtual environment
+rm -rf .venv
+
+# Create a new virtual environment
+python3 -m venv .venv
+
+# Activate the virtual environment
+source .venv/bin/activate
+
+# Ensure the correct Python version is being used
+pyenv global 3.9.11
+eval "$(pyenv init --path)"
+
+# Install the required packages
+echo "Install requirements"
+pip install -r requirements.txt
+
+# Print completion messages
+echo "---------------------------"
+echo ".venv recreated and sourced"
+echo "Set python version to 3.9.11"
+echo "Installed requirements"
+echo "Complete"
+echo "---------------------------"
+
+# download these files
+urls=(
+    "https://brave-today-cdn.brave.com/brave-today/feed.en_US.json"
+    "https://brave-today-cdn.brave.com/source-suggestions/articles_history.en_US.csv"
+    "https://brave-today-cdn.brave.com/sources.en_US.json"
+)
+
+for url in "${urls[@]}"; do
+    # Extract filename from URL
+    filename=$(basename "$url")
+
+    # Download the file using wget
+    wget -O "$filename" "$url"
+
+    # Check if download was successful
+    if [ $? -eq 0 ]; then
+        echo "Successfully downloaded: $filename"
+    else
+        echo "Failed to download: $filename"
+    fi
+done
+
+# Keep the virtual environment active
+exec "$SHELL"

requirements.txt (+3 −4)

@@ -3,12 +3,11 @@ numpy==1.23.5
 pandas==1.5.1
 requests==2.32.3
 scipy==1.10.0
-sentence-transformers==2.7.0
-sentry-sdk==2.8.0
+sentence-transformers==3.0.1
+sentry-sdk==1.45.0
 tqdm==4.66.4
 boto3==1.26.14
 botocore==1.29.14
 structlog==23.3.0
 torch==2.6.0
-torchvision==0.21.0
-transformers==4.51.3
+transformers==4.48.0

source-feed-accumulator.py (+2 −2)

@@ -14,8 +14,8 @@
 def sanitize_articles_history(lang_region):
     articles_history_df = pd.read_csv(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region))
     articles_history_df = articles_history_df.drop_duplicates().dropna()
-    cutoff_date = pd.Timestamp.now().normalize() - pd.Timedelta(days=3*31)
-    # purge articles older than 3 months
+    cutoff_date = pd.Timestamp.now().normalize() - pd.Timedelta(days=2*31)
+    # purge articles older than 2 months
     articles_history_df = articles_history_df[pd.to_datetime(
         articles_history_df.iloc[:, 2]) > cutoff_date]
     articles_history_df.to_csv(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region), index=False)
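As a toy illustration of the tightened cutoff (not the real CSV; its third column is whatever `sanitize_articles_history` reads via `iloc[:, 2]`, assumed here to be the publish date), the 2*31-day filter behaves like this:

```python
# Toy data: the third column stands in for the publish date that
# sanitize_articles_history filters on via iloc[:, 2].
import pandas as pd

now = pd.Timestamp.now().normalize()
articles_history_df = pd.DataFrame({
    "publisher_id": ["a", "b", "c"],
    "title": ["old article", "recent article", "new article"],
    "publish_time": [now - pd.Timedelta(days=120), now - pd.Timedelta(days=30), now],
})

cutoff_date = now - pd.Timedelta(days=2 * 31)
# Rows older than roughly two months (62 days) are purged: the 120-day-old
# row is dropped, the other two are kept.
recent_df = articles_history_df[pd.to_datetime(articles_history_df.iloc[:, 2]) > cutoff_date]
print(recent_df)
```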
