diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..4b6a115
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,8 @@
+[flake8]
+ignore = E203, W503, F541
+max-line-length = 120
+max-doc-length = 120
+max-complexity = 10
+exclude = .venv,cookiecutter,.git,.local,.idea,.mypy_cache,.pytest_cache
+per-file-ignores =
+    __init__.py:F401
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
new file mode 100644
index 0000000..8b71f65
--- /dev/null
+++ b/.github/workflows/pull_request.yml
@@ -0,0 +1,21 @@
+name: Test
+
+on:
+  pull_request
+
+jobs:
+  pull_request:
+    name: Test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Setup Python Version
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+          cache: 'pip' # caching pip dependencies
+      - name: Install Python dependencies
+        run: pip install -r requirements.dev.txt
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index b1dd87d..61f39ae 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -15,7 +15,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
       - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1
+        uses: aws-actions/configure-aws-credentials@v2
         with:
           aws-access-key-id: ${{ secrets.GDBP_AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.GDBP_AWS_SECRET_ACCESS_KEY }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..a122e22
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,32 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+    - id: check-yaml
+    - id: end-of-file-fixer
+    - id: trailing-whitespace
+    - id: check-ast
+    - id: requirements-txt-fixer
+    - id: check-docstring-first
+
+- repo: https://github.com/psf/black
+  rev: 23.7.0
+  hooks:
+    - id: black
+
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.0.0
+  hooks:
+    - id: flake8
+
+- repo: https://github.com/PyCQA/isort
+  rev: 5.12.0
+  hooks:
+    - id: isort
+      args: ["--profile", "black"]
+
+- repo: https://github.com/PyCQA/bandit
+  rev: 1.7.5
+  hooks:
+    - id: bandit
+      entry: bandit --quiet -r -x tests/ src/*.py
diff --git a/SECURITY.md b/SECURITY.md
index f13c6b8..f1f794b 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,4 +2,4 @@
 
 ## Reporting a Vulnerability
 
-See https://hackerone.com/brave for details.
\ No newline at end of file
+See https://hackerone.com/brave for details.
diff --git a/config.py b/config.py
index 6e8ec15..060454e 100644
--- a/config.py
+++ b/config.py
@@ -1,44 +1,55 @@
 import os
 
 # Disable uploads to S3. Useful when running locally or in CI.
-NO_UPLOAD = os.getenv('NO_UPLOAD', None)
-NO_DOWNLOAD = os.getenv('NO_DOWNLOAD', None)
+NO_UPLOAD = os.getenv("NO_UPLOAD", None)
+NO_DOWNLOAD = os.getenv("NO_DOWNLOAD", None)
 
-PCDN_URL_BASE = os.getenv('PCDN_URL_BASE', 'https://pcdn.brave.software')
-PUB_S3_BUCKET = os.getenv('PUB_S3_BUCKET', 'brave-today-cdn-development')
+PCDN_URL_BASE = os.getenv("PCDN_URL_BASE", "https://pcdn.brave.software")
+PUB_S3_BUCKET = os.getenv("PUB_S3_BUCKET", "brave-today-cdn-development")
 
 # Canonical ID of the public S3 bucket
-BRAVE_TODAY_CANONICAL_ID = os.getenv('BRAVE_TODAY_CANONICAL_ID', None)
-BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID = os.getenv('BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID', None)
-
-LANG_REGION_MODEL_MAP = os.getenv('LANG_REGION_MODEL_MAP', [
-    ('en_US', "sentence-transformers/all-MiniLM-L6-v2"),
-    ('en_CA', "sentence-transformers/all-MiniLM-L6-v2"),
-    ('en_GB', "sentence-transformers/all-MiniLM-L6-v2"),
-    ('es_ES', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('es_MX', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('pt_BR', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('ja_JP', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('de_DE', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('fr_FR', "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
-    ('en_AU', "sentence-transformers/all-MiniLM-L6-v2"),
-    ('en_IN', "sentence-transformers/all-MiniLM-L6-v2"),
-
-])
-
-SOURCES_JSON_FILE = os.getenv('SOURCES_JSON_FILE', 'sources.{LANG_REGION}')
-FEED_JSON_FILE = os.getenv('FEED_JSON_FILE', 'feed.{LANG_REGION}')
-
-OUTPUT_DIR = os.getenv('OUTPUT_DIR', 'output')
-
-ARTICLE_HISTORY_FILE = os.getenv('ARTICLE_HISTORY_FILE', "articles_history.{LANG_REGION}.csv")
+BRAVE_TODAY_CANONICAL_ID = os.getenv("BRAVE_TODAY_CANONICAL_ID", None)
+BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID = os.getenv(
+    "BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID", None
+)
+
+LANG_REGION_MODEL_MAP = os.getenv(
+    "LANG_REGION_MODEL_MAP",
+    [
+        ("en_US", "sentence-transformers/all-MiniLM-L6-v2"),
+        ("en_CA", "sentence-transformers/all-MiniLM-L6-v2"),
+        ("en_GB", "sentence-transformers/all-MiniLM-L6-v2"),
+        ("es_ES", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("es_MX", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("pt_BR", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("ja_JP", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("de_DE", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("fr_FR", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
+        ("en_AU", "sentence-transformers/all-MiniLM-L6-v2"),
+        ("en_IN", "sentence-transformers/all-MiniLM-L6-v2"),
+    ],
+)
+
+SOURCES_JSON_FILE = os.getenv("SOURCES_JSON_FILE", "sources.{LANG_REGION}")
+FEED_JSON_FILE = os.getenv("FEED_JSON_FILE", "feed.{LANG_REGION}")
+
+OUTPUT_DIR = os.getenv("OUTPUT_DIR", "output")
+
+ARTICLE_HISTORY_FILE = os.getenv(
+    "ARTICLE_HISTORY_FILE", "articles_history.{LANG_REGION}.csv"
+)
 
 # Don't compute the embedding for a source that has less than 30 collected articles
-MINIMUM_ARTICLE_HISTORY_SIZE = os.getenv('MINIMUM_ARTICLE_HISTORY_SIZE', 30)
-SIMILARITY_CUTOFF_RATIO = os.getenv('SIMILARITY_CUTOFF_RATIO', 0.9)
-SOURCE_SIMILARITY_T10 = os.getenv('SOURCE_SIMILARITY_T10', "source_similarity_t10.{LANG_REGION}")
-SOURCE_SIMILARITY_T10_HR = os.getenv('SOURCE_SIMILARITY_T10_HR', "source_similarity_t10_hr.{LANG_REGION}")
-
-SOURCE_EMBEDDINGS = os.getenv('SOURCE_EMBEDDINGS',
-                              "SOURCE_EMBEDDINGS.{LANG_REGION}")
-
-if SENTRY_URL := os.getenv('SENTRY_URL'):
+MINIMUM_ARTICLE_HISTORY_SIZE = os.getenv("MINIMUM_ARTICLE_HISTORY_SIZE", 30)
+SIMILARITY_CUTOFF_RATIO = os.getenv("SIMILARITY_CUTOFF_RATIO", 0.9)
+SOURCE_SIMILARITY_T10 = os.getenv(
+    "SOURCE_SIMILARITY_T10", "source_similarity_t10.{LANG_REGION}"
+)
+SOURCE_SIMILARITY_T10_HR = os.getenv(
+    "SOURCE_SIMILARITY_T10_HR", "source_similarity_t10_hr.{LANG_REGION}"
+)
+
+SOURCE_EMBEDDINGS = os.getenv("SOURCE_EMBEDDINGS", "SOURCE_EMBEDDINGS.{LANG_REGION}")
+
+if SENTRY_URL := os.getenv("SENTRY_URL"):
     import sentry_sdk
+
     sentry_sdk.init(dsn=SENTRY_URL, traces_sample_rate=0)
diff --git a/embeddings.py b/embeddings.py
index bf62d94..b80c546 100644
--- a/embeddings.py
+++ b/embeddings.py
@@ -9,16 +9,16 @@
 
 logger = get_logger()
 
 
-def compute_source_similarity(source_1, source_2, function='cosine'):
-    if function == 'dot':
+def compute_source_similarity(source_1, source_2, function="cosine"):
+    if function == "dot":
         return util.dot_score(source_1, np.transpose(source_2))
-    elif function == 'cosine':
+    elif function == "cosine":
         return util.pytorch_cos_sim(source_1, source_2)[0][0]
 
 
 def get_source_representation_from_titles(titles, model):
     if len(titles) < config.MINIMUM_ARTICLE_HISTORY_SIZE:
-        return np.zeros((1, EMBEDDING_DIMENSIONALITY))
+        return np.zeros((1, EMBEDDING_DIMENSIONALITY))
 
     return model.encode(titles).mean(axis=0)
@@ -27,5 +27,6 @@ def compute_source_representation_from_articles(articles_df, publisher_id, model
     publisher_bucket_df = articles_df[articles_df.publisher_id == publisher_id]
 
     titles = [
-        title for title in publisher_bucket_df.title.to_numpy() if title is not None]
+        title for title in publisher_bucket_df.title.to_numpy() if title is not None
+    ]
     return get_source_representation_from_titles(titles, model)
diff --git a/renovate.json b/renovate.json
index 39a2b6e..8e32181 100644
--- a/renovate.json
+++ b/renovate.json
@@ -2,5 +2,23 @@
   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
   "extends": [
     "config:base"
+  ],
+  "schedule": [
+    "every 7 days"
+  ],
+  "baseBranches": [
+    "master"
+  ],
+  "pre-commit": {
+    "enabled": true
+  },
+  "pip_requirements": {
+    "fileMatch": ["requirements.*"]
+  },
+  "packageRules": [
+    {
+      "packagePatterns": ["^regex$"],
+      "enabled": false
+    }
   ]
 }
diff --git a/requirements.dev.txt b/requirements.dev.txt
new file mode 100644
index 0000000..0839e1f
--- /dev/null
+++ b/requirements.dev.txt
@@ -0,0 +1,10 @@
+-r requirements.txt
+bandit==1.7.5
+black==23.7.0
+bpython==0.24
+flake8==6.1.0
+isort==5.12.0
+pip-check-reqs==2.4.4
+pre-commit==3.3.3
+pylint==2.17.5
+pytest==7.4.0
diff --git a/requirements.txt b/requirements.txt
index 4269242..9a0fcdc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+boto3==1.26.14
+botocore==1.29.14
 feedparser==6.0.10
 numpy==1.23.5
 pandas==1.5.1
@@ -5,10 +7,8 @@ requests==2.31.0
 scipy==1.10.0
 sentence-transformers==2.2.2
 sentry-sdk==1.29.2
-tqdm==4.66.1
-boto3==1.26.14
-botocore==1.29.14
 structlog==23.1.0
 torch==2.0.1
 torchvision==0.15.2
+tqdm==4.66.1
 transformers==4.31.0
diff --git a/source-feed-accumulator.py b/source-feed-accumulator.py
index d0b15ed..721ffd4 100644
--- a/source-feed-accumulator.py
+++ b/source-feed-accumulator.py
@@ -12,34 +12,52 @@
 
 
 def sanitize_articles_history(lang_region):
-    articles_history_df = pd.read_csv(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region))
+    articles_history_df = pd.read_csv(
+        config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)
+    )
     articles_history_df = articles_history_df.drop_duplicates().dropna()
-    articles_history_df.to_csv(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region), index=False)
+    articles_history_df.to_csv(
+        config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+        index=False,
+    )
 
 
 def accumulate_articles(articles, lang_region):
     for i, article in tqdm(enumerate(articles)):
-        title = article['title'].replace('\r', '').replace('\n', '').replace('"', '')
-        description = article['description'].replace('\r', '').replace('\n', '').replace('"', '')
-        publish_time = article['publish_time']
-        publisher_id = article['publisher_id']
-
-        with open(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region), "a") as f:
-            f.write('"' + '","'.join([title, description, publish_time, publisher_id]) + '"\n')
+        title = article["title"].replace("\r", "").replace("\n", "").replace('"', "")
+        description = (
+            article["description"].replace("\r", "").replace("\n", "").replace('"', "")
+        )
+        publish_time = article["publish_time"]
+        publisher_id = article["publisher_id"]
+
+        with open(
+            config.OUTPUT_DIR
+            + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+            "a",
+        ) as f:
+            f.write(
+                '"'
+                + '","'.join([title, description, publish_time, publisher_id])
+                + '"\n'
+            )
 
 
 for lang_region, model in config.LANG_REGION_MODEL_MAP:
     logger.info(f"Starting feeds accumulator for {lang_region}")
 
-    feed_file = f'{config.FEED_JSON_FILE.format(LANG_REGION=lang_region)}.json'
+    feed_file = f"{config.FEED_JSON_FILE.format(LANG_REGION=lang_region)}.json"
 
     pathlib.Path(config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
 
     if not config.NO_DOWNLOAD:
         download_file(feed_file, config.PUB_S3_BUCKET, f"brave-today/{feed_file}")
-        download_file(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
-                      config.PUB_S3_BUCKET,
-                      f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}")
+        download_file(
+            config.OUTPUT_DIR
+            + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+            config.PUB_S3_BUCKET,
+            f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}",
+        )
 
     with open(feed_file) as feeds:
         feeds_data = json.loads(feeds.read())
@@ -51,8 +69,11 @@ def accumulate_articles(articles, lang_region):
     sanitize_articles_history(lang_region)
 
     if not config.NO_UPLOAD:
-        upload_file(config.OUTPUT_DIR + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
-                    config.PUB_S3_BUCKET,
-                    f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}")
+        upload_file(
+            config.OUTPUT_DIR
+            + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+            config.PUB_S3_BUCKET,
+            f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}",
+        )
 
     logger.info("Finished sanitizing articles_history.")
diff --git a/source-similarity-matrix.py b/source-similarity-matrix.py
index b239163..2019840 100644
--- a/source-similarity-matrix.py
+++ b/source-similarity-matrix.py
@@ -9,28 +9,39 @@
 from tqdm import tqdm
 
 import config
-from embeddings import (EMBEDDING_DIMENSIONALITY,
-                        compute_source_representation_from_articles,
-                        compute_source_similarity)
-from utils import (clean_source_similarity_file, download_file,
-                   get_source_id_for_title, upload_file)
+from embeddings import (
+    EMBEDDING_DIMENSIONALITY,
+    compute_source_representation_from_articles,
+    compute_source_similarity,
+)
+from utils import (
+    clean_source_similarity_file,
+    download_file,
+    get_source_id_for_title,
+    upload_file,
+)
 
 logger = get_logger()
 
 # Compute similarity matrix for all existing LANG_REGION pairs
-for lang_region, model_name in config.LANG_REGION_MODEL_MAP:
+for lang_region, model_name in config.LANG_REGION_MODEL_MAP:  # noqa: C901
     logger.info(
-        f"Started computing similarity matrix for {lang_region} using {model_name}")
+        f"Started computing similarity matrix for {lang_region} using {model_name}"
+    )
 
     pathlib.Path(config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
 
     if not config.NO_DOWNLOAD:
-        download_file(config.OUTPUT_DIR + "/" + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
-                      config.PUB_S3_BUCKET,
-                      f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}")
+        download_file(
+            config.OUTPUT_DIR
+            + "/"
+            + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+            config.PUB_S3_BUCKET,
+            f"source-suggestions/{config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region)}",
+        )
 
-    sources_file = f'{config.SOURCES_JSON_FILE.format(LANG_REGION=lang_region)}.json'
+    sources_file = f"{config.SOURCES_JSON_FILE.format(LANG_REGION=lang_region)}.json"
 
     if not config.NO_DOWNLOAD:
         download_file(sources_file, config.PUB_S3_BUCKET, sources_file)
@@ -41,9 +52,13 @@
     sources_df = pd.json_normalize(sources_data)
     sources_df["source_representation"] = np.nan
 
-    articles_df = pd.read_csv(config.OUTPUT_DIR + '/' + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
-                              header=None)
-    articles_df.columns = ['title', 'description', 'timestamp', 'publisher_id']
+    articles_df = pd.read_csv(
+        config.OUTPUT_DIR
+        + "/"
+        + config.ARTICLE_HISTORY_FILE.format(LANG_REGION=lang_region),
+        header=None,
+    )
+    articles_df.columns = ["title", "description", "timestamp", "publisher_id"]
 
     logger.info("Loading Embedding Model...")
     model = SentenceTransformer(model_name)
@@ -58,17 +73,23 @@
     reprs = np.zeros((publisher_ids.size, EMBEDDING_DIMENSIONALITY))
     for i, publisher_id in tqdm(enumerate(publisher_ids)):
         reprs[i, :] = compute_source_representation_from_articles(
-            articles_df, publisher_id, model)
+            articles_df, publisher_id, model
+        )
         if not reprs[i, :].any():
             logger.warning(
-                f"Source {sources_df[sources_df.publisher_id == publisher_id].publisher_name.item()} has no articles. Skipping...")
+                f"Source {sources_df[sources_df.publisher_id == publisher_id].publisher_name.item()} "
+                f"has no articles. Skipping..."
+            )
 
     logger.info(f"Computing sources representations for {lang_region}")
-    sources_representation = pd.DataFrame({'publisher_id': publisher_ids})
+    sources_representation = pd.DataFrame({"publisher_id": publisher_ids})
     sources_representation = pd.concat(
-        [sources_representation, pd.DataFrame(reprs)], axis=1)
+        [sources_representation, pd.DataFrame(reprs)], axis=1
+    )
     sources_representation.to_csv(
-        f'output/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv', header=None)
+        f"output/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv",
+        header=None,
+    )
     logger.info("Finished building source embeddings.")
 
     # For each source pair, compute pair similarity
@@ -105,35 +126,58 @@
         if sources_ranking:
             top_similarity_score = sources_ranking[0][1]
             similarity_cutoff = config.SIMILARITY_CUTOFF_RATIO * top_similarity_score
-            top10_dictionary[source_id] = [{'source': get_source_id_for_title(source[0], sources_df), 'score': source[1]}
-                                           for
-                                           source in sources_ranking[:10] if source[1] > similarity_cutoff]
-            top10_dictionary_human_readable[feed] = [{'source': source[0], 'score': source[1]} for source in
-                                                     sources_ranking[:10] if source[1] > similarity_cutoff]
+            top10_dictionary[source_id] = [
+                {
+                    "source": get_source_id_for_title(source[0], sources_df),
+                    "score": source[1],
+                }
+                for source in sources_ranking[:10]
+                if source[1] > similarity_cutoff
+            ]
+            top10_dictionary_human_readable[feed] = [
+                {"source": source[0], "score": source[1]}
+                for source in sources_ranking[:10]
+                if source[1] > similarity_cutoff
+            ]
 
     logger.info("Removing un-matched sources")
-    top10_dictionary = clean_source_similarity_file(
-        sources_data, top10_dictionary)
+    top10_dictionary = clean_source_similarity_file(sources_data, top10_dictionary)
 
     logger.info("Outputting sources similarities files")
-    with open(f'output/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json', 'w') as f:
+    with open(
+        f"output/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json",
+        "w",
+    ) as f:
         json.dump(top10_dictionary, f)
-    with open(f'output/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json', 'w') as f:
+    with open(
+        f"output/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json",
+        "w",
+    ) as f:
         json.dump(top10_dictionary_human_readable, f)
 
     logger.info("Script has finished running.")
 
     if not config.NO_UPLOAD:
-        upload_file(config.OUTPUT_DIR + "/" + f'/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json',
-                    config.PUB_S3_BUCKET,
-                    f"source-suggestions/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json")
+        upload_file(
+            config.OUTPUT_DIR
+            + "/"
+            + f"/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json",
+            config.PUB_S3_BUCKET,
+            f"source-suggestions/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json",
+        )
         upload_file(
-            config.OUTPUT_DIR + "/" +
-            f'/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json',
+            config.OUTPUT_DIR
+            + "/"
+            + f"/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json",
             config.PUB_S3_BUCKET,
-            f"source-suggestions/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json")
+            f"source-suggestions/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json",
+        )
 
-        upload_file(config.OUTPUT_DIR + "/" + f'/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv',
-                    config.PUB_S3_BUCKET,
-                    f"source-suggestions/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv")
+        upload_file(
+            config.OUTPUT_DIR
+            + "/"
+            + f"/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv",
+            config.PUB_S3_BUCKET,
+            f"source-suggestions/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv",
+        )
diff --git a/utils.py b/utils.py
index 06c98f9..51940f6 100644
--- a/utils.py
+++ b/utils.py
@@ -2,13 +2,12 @@
 import mimetypes
 
 import boto3
-import numpy as np
 from botocore.exceptions import ClientError
 
 import config
 
 boto_session = boto3.Session()
-s3_client = boto_session.client('s3')
+s3_client = boto_session.client("s3")
 
 
 class InvalidS3Bucket(Exception):
@@ -19,12 +18,17 @@ def upload_file(file_name, bucket, object_name=None):
     if object_name is None:
         object_name = file_name
     try:
-        content_type = mimetypes.guess_type(file_name)[0] or 'binary/octet-stream'
-        s3_client.upload_file(file_name, bucket, object_name, ExtraArgs={
-            'GrantRead': f'id={config.BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID}',
-            'GrantFullControl': f'id={config.BRAVE_TODAY_CANONICAL_ID}',
-            'ContentType': content_type
-        })
+        content_type = mimetypes.guess_type(file_name)[0] or "binary/octet-stream"
+        s3_client.upload_file(
+            file_name,
+            bucket,
+            object_name,
+            ExtraArgs={
+                "GrantRead": f"id={config.BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID}",
+                "GrantFullControl": f"id={config.BRAVE_TODAY_CANONICAL_ID}",
+                "ContentType": content_type,
+            },
+        )
     except ClientError as e:
         logging.error(e)