This repository was archived by the owner on Jun 3, 2021. It is now read-only.

Integrate with partner portal #55

Draft
wants to merge 20 commits into master
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
max-parallel: 1
matrix:
- python-version: [3.8]
+ python-version: [3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v1
5 changes: 1 addition & 4 deletions .github/workflows/cov.yml
@@ -19,10 +19,7 @@ jobs:
run: |
pip install pytest
pip install pytest-cov
- pip install requests
- pip install selenium
- pip install progress
- pip install pandas
+ pip install .
pytest --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
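
Dropping the per-package "pip install" lines in favour of "pip install ." assumes that the package declares its own runtime dependencies. A minimal, hypothetical sketch of what the project's setup.py could declare for the modules added in this PR (the real setup.py is not part of this diff):

# Hypothetical setup.py sketch -- not the project's actual file.
# "pip install ." only pulls in what is listed here, so the runtime
# dependencies imported by pull_fb (requests, browser_cookie3) must appear.
from setuptools import setup, find_packages

setup(
    name="pull_fb",
    packages=find_packages(),
    install_requires=[
        "requests",
        "browser_cookie3",
    ],
)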
33 changes: 33 additions & 0 deletions pull_fb/auth.py
@@ -0,0 +1,33 @@
import requests
import browser_cookie3


def get_auth_cookies():

print(u"\U0001f512" + " Getting authentication cookies...")

return browser_cookie3.load(domain_name=".facebook.com")


def check_auth(cookies):

    # Request the Data for Good portal with the supplied cookies and report
    # whether the response indicates an authenticated session.
    login_url = "https://partners.facebook.com/data_for_good/"

    r = requests.get(login_url, cookies=cookies)

    return check_auth_headers(r.headers, login_url)


def check_auth_headers(headers, login_url):
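    # The presence of this response header is used as the signal that the
    # request was made with a valid, logged-in session.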

    if 'x-fb-rlafr' in headers:

print(u"\U00002705" + " Authenticated.")

return True

else:

print(u"\U0000274c" + f" Not authenticated. You must log in to {login_url} in your default browser.")

return False
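
A minimal usage sketch of the two entry points above, assuming the CLI wiring that calls them (not part of this diff): load cookies from the default browser, then confirm the session is valid before attempting any download.

# Hypothetical caller -- pull_fb's actual command-line wiring is not shown here.
from pull_fb import auth

cookies = auth.get_auth_cookies()

if not auth.check_auth(cookies):
    # check_auth already prints an explanatory message; stop rather than
    # hit the bulk download endpoint with an unauthenticated session.
    raise SystemExit(1)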
141 changes: 141 additions & 0 deletions pull_fb/collection.py
@@ -0,0 +1,141 @@
import os
import requests
import zipfile
import glob
from datetime import datetime
import re


def get_outfn(dataset_id, cwd=None):

    # Resolve the working directory at call time (a default of os.getcwd()
    # would be evaluated only once, at import time).
    if cwd is None:
        cwd = os.getcwd()

    out_fn = os.path.join(cwd, dataset_id + ".csv.zip")

    return out_fn


def write_zipfile(out_fn, data):

try:

print(u"\U0001f4e5" + " Writing data...")

with open(out_fn, 'wb') as fd:
for chunk in data:
fd.write(chunk)

except Exception:

raise Exception("Failed to write output zipfile.")


def unzip_data(out_fn, out_dir=os.getcwd()):

print(u"\U0001f4a5" + " Extracting data...")

try:

with zipfile.ZipFile(out_fn, 'r') as zip_ref:
zip_ref.extractall(out_dir)

except Exception:

raise Exception("Failed to extract files.")


def get_file_dataset_ids(files: list):

try:

dataset_ids = [x.split("/")[-1].split("_")[0] for x in files]

except Exception:

raise Exception("Unable to parse dataset ids.")

return dataset_ids


def get_file_dates(files: list):

try:

dates = [x.split("/")[-1].split("_")[1].replace(".csv", "")
for x in files]

dates = [datetime.strptime(x, "%Y-%m-%d") for x in dates]

except Exception:

raise Exception("Unable to parse dates.")

return dates


def set_file_dataset_ids(files, dataset_id):

print(u"\U0001f4c4" + " Renaming files...")

for file in files:

new_fn = re.sub(
r"\d{15}(_\d{4}-\d{2}-\d{2}_\d{4}.csv)",
rf"{dataset_id}\1",
file)

os.rename(file, new_fn)


def request_data(dataset_id, start_date, end_date, cookies):

try:

url = "https://partners.facebook.com/data_for_good/bulk_download/?"
query = f"resource_type=downloadable_csv&start_date={start_date}&end_date={end_date}&dataset_id={dataset_id}"

print(u"\U0001f30e" + f" Trying {url + query}...")

r = requests.get(url + query,
cookies=cookies)

except Exception:

raise Exception("Unable to request data.")

return r


def download_data(dataset_id, start_date, end_date, cookies):
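    # Full flow: request the bulk CSV export, stream it to a zip on disk,
    # extract the CSVs, delete the zip, then rename the extracted files to
    # use the requested dataset_id.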

r = request_data(dataset_id, start_date, end_date, cookies)

out_fn = get_outfn(dataset_id)

write_zipfile(out_fn, r.iter_content(chunk_size=128))

unzip_data(out_fn)

os.remove(out_fn)

files = glob.glob(os.getcwd() + "/*.csv")

set_file_dataset_ids(files, dataset_id)

print(u"\U0001f389" + f" Done! Collection size: {len(files)} files.")


def get_update_config():
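    # Infer an update window from the CSVs already on disk: resume from the
    # most recent file date and run up to today, using the same dataset_id.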

files = glob.glob(os.getcwd() + "/*.csv")

dataset_ids = get_file_dataset_ids(files)
dates = get_file_dates(files)

start_date = datetime.strftime(max(dates), "%Y-%m-%d")
end_date = datetime.strftime(datetime.now(), "%Y-%m-%d")
dataset_id = dataset_ids[0]

return {
"start_date": start_date,
"end_date": end_date,
"dataset_id": dataset_id
}
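
A minimal sketch of how the update helpers above are presumably combined by the caller (the actual command wiring is not part of this diff): derive the date range from the files already on disk, then fetch anything newer.

# Hypothetical update flow -- assumes authentication has already been checked.
from pull_fb import auth, collection

cookies = auth.get_auth_cookies()

if auth.check_auth(cookies):

    # dataset_id, start_date and end_date are derived from the existing
    # {dataset_id}_{YYYY-MM-DD}_{HHMM}.csv files in the working directory.
    config = collection.get_update_config()

    collection.download_data(config["dataset_id"],
                             config["start_date"],
                             config["end_date"],
                             cookies)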
22 changes: 0 additions & 22 deletions pull_fb/credentials.py

This file was deleted.

164 changes: 0 additions & 164 deletions pull_fb/driver.py

This file was deleted.
