Skip to content

Commit a580dce

Browse files
authored
Merge pull request #7 from Canadian-Geospatial-Platform/dev
v1-1-1-release
2 parents 82726b9 + 0312322 commit a580dce

9 files changed

Lines changed: 146 additions & 80 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Use a base image provided by AWS for Python 3.9 runtime
2+
FROM public.ecr.aws/lambda/python:3.9-x86_64
3+
4+
# Set the working directory
5+
WORKDIR /var/task
6+
7+
# Copy requirements.txt
8+
COPY requirements.txt ./
9+
10+
# Copy function code
11+
COPY app.py ./
12+
13+
# Install the specified packages
14+
RUN pip install -r requirements.txt
15+
16+
# Download the NLTK 'punkt' tokenizer and 'stopwords' data
17+
RUN python -m nltk.downloader -d /var/task/nltk_data punkt stopwords
18+
19+
# Set the NLTK_DATA environment variable to use the included data
20+
ENV NLTK_DATA=/var/task/nltk_data
21+
22+
# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
23+
CMD [ "app.lambda_handler" ]
24+

aws-lambda/Preprocessing_lambda/app.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,31 +8,39 @@
88
import nltk
99
from nltk.corpus import stopwords # module for stop words that come with NLTK
1010
from nltk.stem import PorterStemmer # module for stemming
11-
from nltk.tokenize import word_tokenize # module for tokenizing strings
12-
nltk.download('punkt')
13-
nltk.download('stopwords')
11+
from nltk.tokenize import word_tokenize # module for tokenizing strings
12+
13+
#nltk.download('punkt')
14+
#nltk.download('stopwords')
1415

1516
import re
1617

17-
# Environment variables
18+
# environment variables for lambda
19+
file_name = os.environ['FILE_NAME']
20+
bucket_name = os.environ['BUCKET_NAME']
21+
bucket_name_nlp = os.environ['BUCKET_NAME_NLP']
22+
23+
"""
24+
#dev setting -- comment out for release
1825
file_name = "records.parquet"
1926
bucket_name = "webpresence-geocore-geojson-to-parquet-dev"
2027
bucket_name_nlp='nlp-data-preprocessing'
21-
22-
selected_var = ['features_properties_id', 'features_properties_title_en', 'features_properties_title_en','features_properties_description_en','features_properties_keywords_en']
28+
"""
2329

2430
def lambda_handler(event, context):
31+
2532
#Change directory to /tmp folder
2633
os.chdir('/tmp') #This is important
34+
"""
2735
#Make a directory
2836
if not os.path.exists(os.path.join('mydir')):
2937
os.makedirs('mydir')
30-
38+
"""
3139
df = open_S3_file_as_df(bucket_name, file_name)
3240
print(f'The shape of the raw metadata parquet dataset is {df.shape}')
3341

3442
# Select key columns: English fields plus the French title
35-
df_en = df[selected_var]
43+
df_en = df[['features_properties_id', 'features_properties_title_en','features_properties_title_fr','features_properties_description_en','features_properties_keywords_en']]
3644
# Replace NaN and "Not Available; Indisponible" with empty string
3745
print("The NaN values in the English columns are \n")
3846
df_en = df_en.fillna('')
@@ -58,7 +66,7 @@ def lambda_handler(event, context):
5866
#print(duplicateRowsDF['features_properties_id'].unique())
5967

6068
# Save to temp folder, see https://iotespresso.com/temporary-storage-during-aws-lambda-runtime-python/
61-
save_path = os.path.join(os.getcwd(), 'mydir', 'duplicateRowsDF')
69+
save_path = os.path.join(os.getcwd(), 'duplicateRowsDF')
6270
duplicateRowsDF.to_csv(save_path)
6371
df_fetched= pd.read_csv(save_path)
6472

aws-lambda/Preprocessing_lambda/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ pandas
22
nltk
33
pyarrow
44
fastparquet
5-
requests
5+
requests
6+
urllib3<2
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# Build Cloud Formation Zip Package for Similarity Engine using Data Preprocessing
3+
# Author: Xinli Cai
4+
# Date: August 29, 2023
5+
6+
7+
# Navigate to the directory containing `requirements.txt`
8+
cd similarity-engine-data-process/
9+
10+
# Install required packages
11+
pip install -t similarity-engine-data-preprocess-20230822-2200/ -r requirements.txt
12+
13+
# Change to the build directory
14+
cd similarity-engine-data-preprocess-20230822-2200/
15+
16+
# Zip the necessary files
17+
zip -r similarity-engine-data-preprocess-20230822-2200.zip ../app.py ../__init.py__ ./*
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Use a base image provided by AWS for Python 3.9 runtime
2+
FROM public.ecr.aws/lambda/python:3.9-x86_64
3+
4+
# Set the working directory
5+
WORKDIR /var/task
6+
7+
# Copy requirements.txt
8+
COPY requirements.txt ./
9+
10+
# Copy function code
11+
COPY app.py ./
12+
COPY dynamodb.py ./
13+
14+
# Install the specified packages
15+
RUN pip install -r requirements.txt
16+
17+
# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
18+
CMD [ "app.lambda_handler" ]
19+

aws-lambda/Word2Vec_lambda/README.md

Lines changed: 0 additions & 6 deletions
This file was deleted.

aws-lambda/Word2Vec_lambda/app.py

Lines changed: 38 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,54 @@
1-
21
import boto3
32
import logging
43
from botocore.exceptions import ClientError
54
import pandas as pd
6-
7-
85
import numpy as np
96
import json
107
import datetime
11-
128
import io
139
import os
1410

1511
from gensim.models import Word2Vec
1612
from sklearn.metrics.pairwise import cosine_similarity
1713
from gensim import matutils
18-
19-
from tqdm import tqdm
2014
from dynamodb import *
2115

2216

23-
#dev setting
17+
# environment variables for lambda
18+
file_name = os.environ['FILE_NAME']
19+
file_name_origianl = os.environ['FILE_NAME_ORIGINAL']
20+
bucket_name_nlp = os.environ['BUCKET_NAME_NLP']
21+
bucket_name = os.environ['BUCKET_NAME']
22+
23+
"""
24+
#dev setting -- comment out for release
2425
file_name = "Processed_records.parquet"
2526
bucket_name_nlp = "nlp-data-preprocessing"
2627
file_name_origianl = "records.parquet"
2728
bucket_name = "webpresence-geocore-geojson-to-parquet-dev"
29+
"""
2830

2931
def lambda_handler(event, context):
32+
"""
3033
#Change directory to /tmp folder, this is required if new files are created for lambda
3134
os.chdir('/tmp') #This is important
3235
#Make a directory
3336
if not os.path.exists(os.path.join('mydir')):
3437
os.makedirs('mydir')
35-
38+
"""
3639
# Read the preprocessed data from S3
37-
df_en = open_S3_file_as_df(bucket_name_nlp, file_name)
40+
try:
41+
df_en = open_S3_file_as_df(bucket_name_nlp, file_name)
42+
except ClientError as e:
43+
print('Accessing the S3 was failed on line 47 when calling df_en = open_S3_file_as_df(bucket_name_nlp, file_name)')
44+
print(e.response['Error']['Message'])
3845

39-
# Get a sample of 500 rows as the training data
40-
df = df_en[['features_properties_id', 'features_properties_title_en', 'metadata_en_processed']]
41-
#df = df.sample(n=500, random_state=1)
42-
# Use all data to train the model
43-
df.head()
44-
print(df.shape)
45-
46-
4746
# Use all data to train the model
4847
df = df_en[['features_properties_id', 'features_properties_title_en', 'features_properties_title_fr','metadata_en_processed']]
4948
print(f'The shape of the preprocessed df is {df.shape}')
49+
# Replace the missing value in the 'features_properties_title_en' column with an empty string
50+
df['features_properties_title_en'].fillna('', inplace=True)
51+
5052

5153
# Prepare the input for the Word2Vec model
5254
sentences = df['metadata_en_processed'].apply(lambda x: x.split(' ')).tolist()
@@ -58,55 +60,21 @@ def lambda_handler(event, context):
5860

5961
# Convert each sentence in 'metadata_preprocessed' into a vector
6062
vectors = df['metadata_en_processed'].apply(sentence_to_vector, model=model)
61-
# Replace the missing value in the 'features_properties_title_en' column with an empty string
62-
df['features_properties_title_en'].fillna('', inplace=True)
63+
6364

6465
# Calculate similarity between each vector and all others
6566
similarity_matrix = cosine_similarity(np.array(vectors.tolist()))
6667

67-
68-
# Initialize new columns for the top 5 similar texts
69-
df['sim1'], df['sim2'], df['sim3'], df['sim4'], df['sim5'] = "", "", "", "", ""
70-
71-
# For each text, find the top 5 most similar texts and append their 'features_properties_title_en' as new columns
72-
df.reset_index(drop=True, inplace=True)
73-
for i in tqdm(range(similarity_matrix.shape[0])):
74-
top_5_similar = np.argsort(-similarity_matrix[i, :])[1:6] # Exclude the text itself
75-
df.loc[i, ['sim1', 'sim2', 'sim3', 'sim4', 'sim5']] = df.loc[top_5_similar, 'features_properties_title_en'].values
76-
77-
# Read the original parquet file and merge by features_properties_id
78-
df_original = open_S3_file_as_df(bucket_name, file_name_origianl)
79-
merged_df = df_original.merge(df[['features_properties_id', 'sim1', 'sim2', 'sim3', 'sim4', 'sim5']], on='features_properties_id', how='left')
80-
""" Option 1: merge the similar results with records.parquet directly
81-
# Initialize new columns for the top 10 similar texts
82-
df['sim1'], df['sim2'], df['sim3'], df['sim4'], df['sim5'],df['sim6'], df['sim7'], df['sim8'], df['sim9'], df['sim10'] = "", "", "", "", "","", "", "", "", ""
8368

84-
# For each text, find the top 10 most similar texts and append their 'features_properties_title_en' as new columns
85-
df.reset_index(drop=True, inplace=True)
86-
for i in tqdm(range(similarity_matrix.shape[0])):
87-
top_10_similar = np.argsort(-similarity_matrix[i, :])[1:11] # Exclude the text itself
88-
df.loc[i, ['sim1', 'sim2', 'sim3', 'sim4', 'sim5','sim6', 'sim7', 'sim8', 'sim9', 'sim10']] = df.loc[top_10_similar, 'features_properties_id'].values
89-
90-
# Read the original parquet file and merge by features_properties_id
91-
df_original = open_S3_file_as_df(bucket_name, file_name_origianl)
92-
merged_df = df_original.merge(df[['features_properties_id', 'sim1', 'sim2', 'sim3', 'sim4', 'sim5','sim6', 'sim7', 'sim8', 'sim9', 'sim10']],
93-
on='features_properties_id', how='left')
94-
print(f'the shape of original parquet file is {df_original.shape}')
95-
96-
# Save to temp folder, see https://iotespresso.com/temporary-storage-during-aws-lambda-runtime-python/
97-
save_path = os.path.join(os.getcwd(), 'mydir', 'merged_df')
98-
merged_df.to_csv(save_path)
99-
df_fetched= pd.read_csv(save_path)
100-
print(f'the shape of merged parquet file is {merged_df.shape}')
101-
# upload merged dataframe to S3
102-
upload_dataframe_to_s3_as_parquet(df=df_fetched, bucket_name=bucket_name_nlp, file_key='sim_word2vec_records.parquet')
103-
"""
69+
# Upload the similarity results to an AWS DynamoDB table
70+
"""
71+
The parquet lambda function has been modified to merge the similarity table with records.parquet every time records.parquet is updated.
10472
105-
#Option 2: upload the similar results as a dynamodb, and merge the tabke with records.parquet everytime when records.parquet is updated
73+
"""
10674
df['similarity'] = np.nan # Initialize the column
10775
# For each text, find the top 10 most similar texts and save them as a JSON array object in the 'similarity' column
10876
df.reset_index(drop=True, inplace=True)
109-
for i in tqdm(range(similarity_matrix.shape[0])):
77+
for i in range(similarity_matrix.shape[0]):
11078
top_10_similar = np.argsort(-similarity_matrix[i, :])[1:11] # Exclude the text itself
11179
sim_array = []
11280
for j, idx in enumerate(top_10_similar):
@@ -137,7 +105,6 @@ def lambda_handler(event, context):
137105
delete_table(TableName='similarity')
138106
waiter = client.get_waiter('table_not_exists')
139107
waiter.wait(TableName='similarity')
140-
print('Before create')
141108
except ClientError as e:
142109
print(e)
143110
#Create table
@@ -148,8 +115,20 @@ def lambda_handler(event, context):
148115
waiter.wait(TableName='similarity')
149116
except ClientError as e:
150117
print(e)
118+
119+
"""DEBUG
120+
#Check for empty strings in the primary key before scanning the table
121+
empty_string_rows = df[df['features_properties_id'] == '']
122+
print(f'Number of NA values in the df id column is \n {empty_string_rows}')
123+
"""
124+
#Remove rows with an empty string in 'features_properties_id'; the primary key cannot be empty in a DynamoDB table
125+
df_cleaned = df[df['features_properties_id']!='']
126+
rows_removed = df.shape[0] - df_cleaned.shape[0]
127+
print(f'Removed {rows_removed} rows with empyt string in features_properties_id')
151128
#Batch write to table
152-
batch_write_items_into_table(df, TableName='similarity')
129+
batch_write_items_into_table(df_cleaned, TableName='similarity')
130+
131+
153132

154133
# Function to read the parquet file as pandas dataframe
155134
def open_S3_file_as_df(bucket_name, file_name):
Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
gensim==4.1.2
2-
scikit-learn==1.0
3-
pyarrow
4-
fastparquet
5-
tqdm
1+
pandas
2+
numpy
3+
scikit-learn
4+
transformers
5+
torch
6+
pyarrow
7+
fastparquet
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
# Build Cloud Formation Package for Similarity Engine using Word2Vec
3+
# Author: Xinli Cai
4+
# Date: August 29, 2023
5+
6+
7+
# Cloud Formation Package for Similarity Engine using Word2Vec
8+
9+
# Step 1: Navigate to the directory where the `requirements.txt` is
10+
cd similarity-engine-word2vec-model-dev/
11+
12+
# Step 2: Install the required packages
13+
pip install -t similarity-engine-word2vec-model-build/ -r requirements.txt
14+
15+
# Step 3: Change to the build directory
16+
cd similarity-engine-word2vec-model-build/
17+
18+
# Step 4: Zip the necessary files
19+
zip -r similarity-engine-word2vec-model.zip ../app.py ../dynamodb.py ../__init.py__ ./*
20+
21+
# Restore to the initial directory
22+
cd ..

0 commit comments

Comments
 (0)