diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2acd8b1..bba386e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 1 matrix: - python-version: [3.8] + python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/cov.yml b/.github/workflows/cov.yml index d9f6909..a2c4c0c 100644 --- a/.github/workflows/cov.yml +++ b/.github/workflows/cov.yml @@ -19,10 +19,7 @@ jobs: run: | pip install pytest pip install pytest-cov - pip install requests - pip install selenium - pip install progress - pip install pandas + pip install . pytest --cov=./ --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 diff --git a/pull_fb/auth.py b/pull_fb/auth.py new file mode 100644 index 0000000..cb90099 --- /dev/null +++ b/pull_fb/auth.py @@ -0,0 +1,33 @@ +import requests +import browser_cookie3 + + +def get_auth_cookies(): + + print(u"\U0001f512" + " Getting authentication cookies...") + + return browser_cookie3.load(domain_name=".facebook.com") + + +def check_auth(cookies): + + login_url = "https://partners.facebook.com/data_for_good/" + + r = requests.get(login_url, cookies=cookies) + + check_auth_headers(r.headers, login_url) + + +def check_auth_headers(headers, login_url): + + if 'x-fb-rlafr' in headers.keys(): + + print(u"\U00002705" + " Authenticated.") + + return True + + else: + + print(u"\U0000274c" + f" Not authenticated. You must log in to {login_url} in your default browser.") + + return False diff --git a/pull_fb/collection.py b/pull_fb/collection.py new file mode 100644 index 0000000..216b45f --- /dev/null +++ b/pull_fb/collection.py @@ -0,0 +1,141 @@ +import os +import requests +import zipfile +import glob +from datetime import datetime +import re + + +def get_outfn(dataset_id, cwd=os.getcwd()): + + out_fn = cwd + "/" + dataset_id + ".csv.zip" + + return out_fn + + +def write_zipfile(out_fn, data): + + try: + + print(u"\U0001f4e5" + " Writing data...") + + with open(out_fn, 'wb') as fd: + for chunk in data: + fd.write(chunk) + + except Exception: + + raise Exception("Failed to write output zipfile.") + + +def unzip_data(out_fn, out_dir=os.getcwd()): + + print(u"\U0001f4a5" + " Extracting data...") + + try: + + with zipfile.ZipFile(out_fn, 'r') as zip_ref: + zip_ref.extractall(out_dir) + + except Exception: + + raise Exception("Failed to extract files.") + + +def get_file_dataset_ids(files: list): + + try: + + dataset_ids = [x.split("/")[-1].split("_")[0] for x in files] + + except Exception: + + raise Exception("Unable to parse dataset ids.") + + return dataset_ids + + +def get_file_dates(files: list): + + try: + + dates = [x.split("/")[-1].split("_")[1].replace(".csv", "") + for x in files] + + dates = [datetime.strptime(x, "%Y-%m-%d") for x in dates] + + except Exception: + + raise Exception("Unable to parse dates.") + + return dates + + +def set_file_dataset_ids(files, dataset_id): + + print(u"\U0001f4c4" + " Renaming files...") + + for file in files: + + new_fn = re.sub( + r"\d{15}(_\d{4}-\d{2}-\d{2}_\d{4}.csv)", + rf"{dataset_id}\1", + file) + + os.rename(file, new_fn) + + +def request_data(dataset_id, start_date, end_date, cookies): + + try: + + url = "https://partners.facebook.com/data_for_good/bulk_download/?" 
+ query = f"resource_type=downloadable_csv&start_date={start_date}&end_date={end_date}&dataset_id={dataset_id}" + + print(u"\U0001f30e" + f" Trying {url + query}...") + + r = requests.get(url + query, + cookies=cookies) + + except Exception: + + raise Exception("Unable to request data.") + + return r + + +def download_data(dataset_id, start_date, end_date, cookies): + + r = request_data(dataset_id, start_date, end_date, cookies) + + out_fn = get_outfn(dataset_id) + + write_zipfile(out_fn, r.iter_content(chunk_size=128)) + + unzip_data(out_fn) + + os.remove(out_fn) + + files = glob.glob(os.getcwd() + "/*.csv") + + set_file_dataset_ids(files, dataset_id) + + print(u"\U0001f389" + f" Done! Collection size: {len(files)} files.") + + +def get_update_config(): + + files = glob.glob(os.getcwd() + "/*.csv") + + dataset_ids = get_file_dataset_ids(files) + dates = get_file_dates(files) + + start_date = datetime.strftime(max(dates), "%Y-%m-%d") + end_date = datetime.strftime(datetime.now(), "%Y-%m-%d") + dataset_id = dataset_ids[0] + + return { + "start_date": start_date, + "end_date": end_date, + "dataset_id": dataset_id + } diff --git a/pull_fb/credentials.py b/pull_fb/credentials.py deleted file mode 100644 index ca6ad4a..0000000 --- a/pull_fb/credentials.py +++ /dev/null @@ -1,22 +0,0 @@ -from getpass import getpass - - -def get_credentials(username, password): - """ - Prompt for Facebook login get_credentials - - Future: add option to cache encrypted credentials - """ - - # Prompt for username - if username is None: - - username = input("Email: ") - - # Prompt for password - if password is None: - - password = getpass("Password: ") - - # Return dictionary of username and password - return {"email": username, "password": password} diff --git a/pull_fb/driver.py b/pull_fb/driver.py deleted file mode 100644 index 5516b5e..0000000 --- a/pull_fb/driver.py +++ /dev/null @@ -1,164 +0,0 @@ -import time -import requests -from datetime import datetime -from selenium import webdriver -from progress.bar import Bar -from io import StringIO -import pandas as pd - - -def authenticate_driver(keys: dict, - driver_path: str, - driver_flags: list, - driver_prefs: dict): - - print('Authenticating webdriver...') - - # Define options for web driver - chrome_options = webdriver.ChromeOptions() - - # Apply preferences to chrome driver - chrome_options.add_experimental_option("prefs", driver_prefs) - - # Add individual flags to chromedriver prefs - for flag in driver_flags: - - chrome_options.add_argument(flag) - - driver = webdriver.Chrome( - executable_path=driver_path, options=chrome_options - ) - - # Login url for Geoinsights platform - geoinsights_url = "https://www.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2Fgeoinsights-portal%2F" - - # Access login url with webdriver - driver.get(geoinsights_url) - - # Pause for page load (and cookie acceptance) - time.sleep(5) - - # Try to accept cookies. 
On failure, pass - try: - - driver.find_element_by_xpath('//button[@data-cookiebanner="accept_button"]').click() - - except Exception: - - pass - - # Add username in username form field - driver.find_element_by_xpath('//*[@id="email"]').send_keys(keys["email"]) - - # Add password in password form field - driver.find_element_by_xpath('//*[@id="pass"]').send_keys(keys["password"]) - - # Click login button - driver.find_element_by_xpath('//*[@id="loginbutton"]').click() - - time.sleep(1) - - # Get cookies from authenticated web driver - request_cookies_browser = driver.get_cookies() - - driver.quit() - - print('Successfully authenticated webdriver.') - - return(request_cookies_browser) - - -def authenticate_session(request_cookies_browser: list): - - # Create a nes requests session - s = requests.Session() - - # Pass the cookies from the authenticated webdriver to the session - [s.cookies.set(c['name'], c['value']) for c in request_cookies_browser] - - return s - - -def download_data(download_urls: list, - area: str, - outdir: str, - request_cookies_browser: list): - - s = authenticate_session(request_cookies_browser) - - # Start download bar - print("\n") - bar = Bar("Downloading", max=len(download_urls)) - - # Store unsuccessful download file names - download_failed = [] - - # For each download url, download dataset - for i, url in enumerate(download_urls): - - # Request dataset from URL - resp = s.get(url["url"]) - - # Define output file name - out_fn = format_out_fn(outdir, area, url["date"]) - - download_failed = write_outfile(resp, out_fn, download_failed) - - time.sleep(1) - - # Update progress bar - bar.next() - - # Close progress bar - bar.finish() - - if len(download_failed) > 0: - - print('Failed to download {} files. Please try again later.'.format(len(download_failed))) - - -def write_outfile(resp: requests.Response, out_fn: str, download_failed: list): - - if resp.status_code == 200: - - try: - - # try to convert response data to csv with >1 row - data = response_as_dataframe(resp.text) - - # Write response data as csv - data.to_csv(out_fn) - - except Exception: - - # Append failed filename download - download_failed.append(out_fn) - - pass - - return download_failed - - -def response_as_dataframe(text: str): - - data = StringIO(text) - - df = pd.read_csv(data) - - try: - - assert len(df.index) > 1 - - except Exception as e: - - raise e - - return(df) - - -def format_out_fn(outdir: str, area: str, date: datetime): - - # Define new file name as AREA_DATE.csv - new_name = outdir + "/" + area + date.strftime("_%Y_%m_%d_%H%M") + ".csv" - - return(new_name) diff --git a/pull_fb/pull_fb.py b/pull_fb/pull_fb.py index 0810747..64024a9 100644 --- a/pull_fb/pull_fb.py +++ b/pull_fb/pull_fb.py @@ -1,151 +1,77 @@ import click -import os from datetime import datetime -import pull_fb.utils as utils -import pull_fb.url as url -import pull_fb.driver as driver -import pull_fb.credentials as credentials - - -@click.command() -@click.option("-d", "--dataset_name", help="Dataset name to be downloaded.") -@click.option("-a", "--area", help="Area to be downloaded.") -@click.option( - "-o", - "--outdir", - help="Outfile directory. Default: current directory.", - default=os.getcwd(), -) -@click.option( - "-e", - "--end_date", - help="Dataset end date. Default: datetime.now().", - default=datetime.now(), -) -@click.option( - "-f", - "--frequency", - help="Dataset update frequency (hours). 
Default: 8.",
-    default=8
-)
-@click.option(
-    "-driver",
-    "--driver_path",
-    help="Path to webdriver.",
-    default="/Applications/chromedriver",
-)
-@click.option(
-    "-config",
-    "--config_path",
-    help=".config path. Default is requested from the repo, otherwise is read from provided local path or other http connection.",
-    default="https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config",
-)
-@click.option(
-    "-user",
-    "--username",
-    help="Facebook username.",
-    default=None
-)
-@click.option(
-    "-pass",
-    "--password",
-    help="Facebook password.",
-    default=None
-)
-@click.option(
-    "-driver_flags",
-    "--driver_flags",
-    help="Flags passed to chromedriver.",
-    multiple=True,
-    default=["--headless"]
+
+from pull_fb.auth import (
+    get_auth_cookies,
+    check_auth
 )
-@click.option(
-    "-driver_prefs",
-    "--driver_prefs",
-    help="Preferences passed to chromedriver.",
-    default={"download.default_directory": os.getcwd()}
+
+from pull_fb.collection import (
+    download_data,
+    get_update_config,
 )
-def cli(
-    dataset_name,
-    area,
-    outdir=None,
-    end_date=None,
-    frequency=None,
-    driver_path=None,
-    config_path=None,
-    username=None,
-    password=None,
-    driver_flags=None,
-    driver_prefs=None):
-    """
-    Entry point for the pull_fb cli.
-
-    """
-
-    pull_fb(dataset_name,
-            area,
-            outdir,
-            end_date,
-            frequency,
-            driver_path,
-            config_path,
-            username,
-            password,
-            driver_flags,
-            driver_prefs)
-
-
-def pull_fb(dataset_name,
-            area,
-            outdir: str = os.getcwd(),
-            end_date: datetime = datetime.now(),
-            frequency: int = 8,
-            driver_path: str = "/Applications/chromedriver",
-            config_path: str = "https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config",
-            username: str = None,
-            password: str = None,
-            driver_flags: list = ["--headless"],
-            driver_prefs: dict = {"download.default_directory": os.getcwd()}):
-
-    print("Reading dataset configuration...")
-    # Get config variables from repository
-    config = utils.get_download_variables(dataset_name,
-                                          area,
-                                          end_date,
-                                          config_path)
-
-    # Get date sequence between start and end dates
-    data_dates = utils.get_file_dates(
-        config["start_date"], config["end_date"], frequency
+
+
+@click.group()
+def cli():
+    pass
+
+
+@click.group()
+def auth():
+    pass
+
+
+@click.group()
+def collection():
+    pass
+
+
+@auth.command('status')
+def auth_status():
+
+    cookies = get_auth_cookies()
+
+    check_auth(cookies)
+
+
+@collection.command("init")
+@click.option('--dataset_id', required=True)
+@click.option('--start_date', required=True)
+@click.option('--end_date', default=datetime.strftime(datetime.now(), "%Y-%m-%d"))
+def collection_init(dataset_id,
+                    start_date,
+                    end_date):
+
+    cookies = get_auth_cookies()
+
+    download_data(
+        dataset_id,
+        start_date,
+        end_date,
+        cookies
     )
-    # Get downloaded dates from outdir
-    existing_dates = utils.get_existing_dates(outdir, area)
 
-    # Only download dates that have not already been downloaded
-    download_dates = list(set(data_dates).difference(set(existing_dates)))
+@collection.command("update")
+def collection_update():
 
-    download_dates.sort()
+    cookies = get_auth_cookies()
 
-    # Get url of each of dataset
-    download_urls = url.format_urls(dataset_name,
-                                    config["dataset_id"],
-                                    download_dates)
+    config = get_update_config()
 
-    # Get credentials here
-    keys = credentials.get_credentials(username, password)
+    download_data(
+        config["dataset_id"],
+        config["start_date"],
+        config["end_date"],
+        cookies
+    )
 
-    # Authenticate webdriver
-    request_cookies_browser = 
driver.authenticate_driver(keys, - driver_path, - driver_flags, - driver_prefs) - # Download url sequence and move to output directory - driver.download_data(download_urls, - area, - outdir, - request_cookies_browser) +cli.add_command(auth) +cli.add_command(collection) - # Success message - print('Done.') +# add pull_fb collection audit +# to check - no duplicate files +# all files are present in range +# Only one dataset id diff --git a/pull_fb/url.py b/pull_fb/url.py deleted file mode 100644 index b9a7d1f..0000000 --- a/pull_fb/url.py +++ /dev/null @@ -1,37 +0,0 @@ -def format_urls(dataset_name: str, dataset_id: str, download_dates: list): - """Function to format urls with the appropriate format""" - - # Define base urls for each supported dataset - # Move this into a config in the future - base_urls = { - "TileMovement": "https://www.facebook.com/geoinsights-portal/downloads/vector/?id={}&ds={}", - "TilePopulation": "https://www.facebook.com/geoinsights-portal/downloads/raster/?id={}&ds={}" - } - - # Define date formats for download urls of each dataset - date_formats = { - "TileMovement": "%Y-%m-%d+%H%M", - "TilePopulation": "%Y-%m-%d+%H%M" - } - - # Define the appropriate base_url - base_url = base_urls[dataset_name] - - # Define the appropriate date_format - date_format = date_formats[dataset_name] - - # List of download urls - urls = [] - - # For each download date, format a download url and record dataset date - for date in download_dates: - - urls.append( - { - "url": base_url.format(dataset_id, date.strftime(date_format)), - "date": date, - } - ) - - # Return a list of url, date pair dictionaries - return urls diff --git a/pull_fb/utils.py b/pull_fb/utils.py deleted file mode 100644 index 42cdc10..0000000 --- a/pull_fb/utils.py +++ /dev/null @@ -1,159 +0,0 @@ -import requests -import os -import glob -from datetime import datetime, timedelta - - -def get_config(config_path): - """ - Funciton to get configuration file from online repository - """ - - # Try to get config file or raise exception - try: - if config_path.startswith('http'): - r = requests.get(config_path) - config_var = r.text.split("\n")[:-1] - else: - with open(config_path) as f: - r = f.readlines() - config_var = [x.replace("\n", "") for x in r] - - except requests.exceptions.RequestException as e: - - raise SystemExit(e) - - # Extract config variables to dictionary or raise Exception - try: - config = dict(x.split("=") for x in config_var) - - except Exception: - - raise Exception("Malformed .config file.") - - # Return config variables as a dictionary - return(config) - - -def get_download_variables(dataset: str, country: str, end_date: str, config_path: str): - """ - Function to get downlaod variable for a particular dataset from config file - - This could be simplified - """ - - # Get config variables from repository - config = get_config(config_path) - - # Extract dataset id or raise missing dataset error - try: - - dataset_id = config["_".join([country, dataset, "ID"])] - - except Exception: - - raise KeyError( - "No config value for {}. To add a new dataset, see the Readme.".format( - "_".join([country, dataset, "ID"]) - ) - ) - - # Extract dataset origin or raise missing dataset error - try: - - dataset_origin = config["_".join([country, dataset, "Origin"])] - - except Exception: - - raise KeyError( - "No config value for {}. 
To add a new dataset, see the Readme.".format( - "_".join([country, dataset, "Origin"]) - ) - ) - - # Convert datset origin string to datetime object - dataset_origin = date_str_to_datetime(dataset_origin) - - # Return config variables as dict - return { - "dataset_id": dataset_id, - "start_date": dataset_origin, - "end_date": end_date, - } - - -def date_str_to_datetime(date: str): - """ - Function to parse origin date in the format '%Y_%m_%d_%H' or '%Y_%m_%d' - """ - - # List of recognized date formats - formats = ["%Y_%m_%d_%H%M", "%Y_%m_%d_%H", "%Y_%m_%d"] - - # Try to match formats until one succeeds - for format in formats: - - try: - - # Return datetime object - return datetime.strptime(date, format) - - except ValueError: - - pass - - # Raise ValueError for unknown date format - raise ValueError("Unknown date format.") - - -def get_file_dates(start_date, end_date, frequency): - """ - Function to get date sequence between start_date and end_date with a - given frequency - - This could be replaced by a datetime function - """ - - # List to store dataset dates - data_dates = [] - - # Define start of date list - date = start_date - - # Loop through date range, incrementing by `frequency` hours - while date < end_date: - - data_dates.append(date) - - date = date + timedelta(hours=frequency) - - # Return list of dataset dates - return data_dates - - -def get_existing_dates(outdir: str, area: str): - """ - Function to get dates from files in the outdir - """ - - # Extract file names from csv files in outdir (only for current area) - date_str = [os.path.basename(x) for x in glob.glob(outdir + "/" + area + "_" + "*.csv")] - - # Remove area from file name - date_str = [x.replace(area + "_", "") for x in date_str] - - # Remove extension from file name - date_str = [x.replace(".csv", "") for x in date_str] - - # Convert date string to datetime object - date_str = [date_str_to_datetime(x) for x in date_str] - - # If any existing files are found, notify user - if len(date_str) > 0: - - message = "Found existing collection in output directory ({} files).\nOnly new files will be downloaded." 
-
-        print(message.format(str(len(date_str))))
-
-    # Return a list of the dates of datasets that have already been downloaded
-    return date_str
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..e195370
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,22 @@
+[tool.poetry]
+name = "pull_fb"
+version = "0.1.0"
+description = "Imitate an API for downloading data from Facebook Data For Good"
+authors = ["hamishgibbs <Hamish.Gibbs@lshtm.ac.uk>"]
+license = "MIT"
+
+[tool.poetry.dependencies]
+python = "^3.7"
+requests = "^2.25.1"
+browser-cookie3 = "^0.12.1"
+click = "^7.1.2"
+
+[tool.poetry.dev-dependencies]
+pytest = "^6.2.4"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+pull_fb="pull_fb.pull_fb:cli"
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 359a625..0000000
--- a/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import setuptools
-
-setuptools.setup(
-    name="pull_fb",
-    version="0.0.1",
-    author="Hamish Gibbs",
-    author_email="Hamish.Gibbs@lshtm.ac.uk",
-    description="CLI for downloading data from Facebook data for good.",
-    url="https://github.com/hamishgibbs/pull_facebook_data_for_good",
-    packages=setuptools.find_packages(),
-    install_requires=[
-        "Click",
-        "requests",
-        "pandas",
-        "progress"
-    ],
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.6",
-    entry_points="""
-        [console_scripts]
-        pull_fb=pull_fb.pull_fb:cli
-    """,
-)
diff --git a/tests/test_check_auth_headers.py b/tests/test_check_auth_headers.py
new file mode 100644
index 0000000..b363095
--- /dev/null
+++ b/tests/test_check_auth_headers.py
@@ -0,0 +1,17 @@
+from pull_fb.auth import check_auth_headers
+
+
+def test_check_auth_headers_true():
+
+    headers = {
+        "x-fb-rlafr": "test"
+    }
+
+    assert check_auth_headers(headers, "a")
+
+
+def test_check_auth_headers_false():
+
+    headers = {}
+
+    assert not check_auth_headers(headers, "a")
diff --git a/tests/test_credentials.py b/tests/test_credentials.py
deleted file mode 100644
index 0b1658d..0000000
--- a/tests/test_credentials.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from pull_fb import credentials
-
-
-def test_credentials_filled():
-
-    res = credentials.get_credentials('a', 'b')
-
-    assert type(res) is dict
-
-
-def test_credentials_usenrame_none(monkeypatch):
-
-    monkeypatch.setattr('builtins.input', lambda _: "example@gmail.com")
-
-    # go about using input() like you normally would:
-    res = credentials.get_credentials(None, 'b')
-
-    assert type(res) is dict
-    assert res['email'] == "example@gmail.com"
diff --git a/tests/test_driver.py b/tests/test_driver.py
deleted file mode 100644
index 39476aa..0000000
--- a/tests/test_driver.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import os
-import pytest
-from datetime import datetime
-from pull_fb import driver
-import requests
-import pandas as pd
-
-
-@pytest.fixture()
-def sample_csv_response():
-
-    return 'a,b\n1,2\n3,4'
-
-
-@pytest.fixture()
-def mock_csv_response():
-
-    class Mock_Response():
-
-        def __init__(self):
-
-            self.status_code = 200
-            self.text = 'a,b\n1,2\n3,4'
-
-    return Mock_Response()
-
-
-@pytest.fixture(scope="session")
-def tmp_path(tmpdir_factory):
-
-    path = tmpdir_factory.mktemp("tmp")
-
-    return path
-
-
-def test_format_out_fn():
-
-    res = driver.format_out_fn('a', 'b', datetime(2000, 1, 1, 0))
-
-    assert res == 'a/b_2000_01_01_0000.csv'
-
-
-def test_response_as_dataframe_reads_csv(sample_csv_response):
-
-    res = 
driver.response_as_dataframe(sample_csv_response)
-
-    assert type(res) is pd.DataFrame
-
-
-def test_response_as_dataframe_raises_one_row():
-
-    with pytest.raises(AssertionError):
-
-        driver.response_as_dataframe('a,b\n1,2')
-
-
-def test_response_as_dataframe_fails_html():
-
-    with pytest.raises(AssertionError):
-
-        driver.response_as_dataframe('<html><body>Other stuff</body></html>')
-
-
-def test_authenticate_session_with_cookies():
-
-    request_cookies_browser = [{'name': 'item', 'value': 'item'}]
-
-    res = driver.authenticate_session(request_cookies_browser)
-
-    assert type(res) is requests.Session
-
-
-def test_write_outfile(mock_csv_response):
-
-    res = driver.write_outfile(mock_csv_response, 'test.csv', [])
-
-    assert type(res) is list
-
-    os.remove('test.csv')
-
-
-# test download_data
-def test_download_data_tmp_dir(tmp_path):
-
-    download_urls = [{"url": "https://github.com/hamishgibbs/uk_tier_data/raw/master/output/uk_tier_data_parliament_2020_10_25_1606.csv",
-                      "date": datetime(2020, 3, 1, 0)}]
-
-    driver.download_data(download_urls,
-                         "Britain",
-                         str(tmp_path),
-                         [])
-
-    assert os.path.exists(str(tmp_path) + '/Britain_2020_03_01_0000.csv')
diff --git a/tests/test_get_file_dataset_ids.py b/tests/test_get_file_dataset_ids.py
new file mode 100644
index 0000000..a15010b
--- /dev/null
+++ b/tests/test_get_file_dataset_ids.py
@@ -0,0 +1,20 @@
+import pytest
+
+from pull_fb.collection import get_file_dataset_ids
+
+
+def test_get_file_dataset_ids():
+
+    files = ["a/b/c/1_2020-01-01_0000.csv"]
+
+    res = get_file_dataset_ids(files)
+
+    assert res == ["1"]
+
+
+def test_get_file_dataset_ids_raises():
+
+    files = [None]
+
+    with pytest.raises(Exception):
+        get_file_dataset_ids(files)
diff --git a/tests/test_get_file_dates.py b/tests/test_get_file_dates.py
new file mode 100644
index 0000000..b412a38
--- /dev/null
+++ b/tests/test_get_file_dates.py
@@ -0,0 +1,21 @@
+import pytest
+from datetime import datetime
+
+from pull_fb.collection import get_file_dates
+
+
+def test_get_file_dates_ids():
+
+    files = ["a/b/c/1_2020-01-01_0000.csv"]
+
+    res = get_file_dates(files)
+
+    assert res == [datetime(2020, 1, 1)]
+
+
+def test_get_file_dates_raises():
+
+    files = [None]
+
+    with pytest.raises(Exception):
+        get_file_dates(files)
diff --git a/tests/test_get_outfn.py b/tests/test_get_outfn.py
new file mode 100644
index 0000000..489c576
--- /dev/null
+++ b/tests/test_get_outfn.py
@@ -0,0 +1,8 @@
+from pull_fb.collection import get_outfn
+
+
+def test_get_outfn():
+
+    res = get_outfn("1", "a")
+
+    assert res == "a/1.csv.zip"
diff --git a/tests/test_set_file_dataset_ids.py b/tests/test_set_file_dataset_ids.py
new file mode 100644
index 0000000..51e333d
--- /dev/null
+++ b/tests/test_set_file_dataset_ids.py
@@ -0,0 +1,23 @@
+import os
+from tests.utils import tmpdir
+from pull_fb.collection import set_file_dataset_ids
+
+
+def test_set_file_dataset_ids(tmpdir):
+
+    fn = str(tmpdir + "/123456789123456_2020-01-01_0000.csv")
+
+    with open(fn, "w") as f:
+        f.write("text")
+
+    assert os.path.exists(fn)
+
+    set_file_dataset_ids([fn], "123")
+
+    fn_exp = tmpdir + "/123_2020-01-01_0000.csv"
+
+    assert os.path.exists(
+        str(fn_exp)
+    )
+
+    os.remove(fn_exp)
diff --git a/tests/test_unzip_data.py b/tests/test_unzip_data.py
new file mode 100644
index 0000000..2dbd9dc
--- /dev/null
+++ b/tests/test_unzip_data.py
@@ -0,0 +1,38 @@
+import os
+import pytest
+import shutil
+from tests.utils import tmpdir
+
+from pull_fb.collection import unzip_data
+
+
+def test_unzip_data(tmpdir):
+
+    fn = str(tmpdir + "/123456789123456_2020-01-01_0000.csv")
+    zip_fn = fn + ".zip"
+
+    with open(fn, "w") as f:
+        f.write("text")
+
+    assert os.path.exists(fn)
+
+    shutil.make_archive(fn, 'zip', tmpdir)
+
+    assert os.path.exists(zip_fn)
+
+    os.remove(fn)
+
+    assert not os.path.exists(fn)
+
+    unzip_data(zip_fn, tmpdir)
+
+    assert os.path.exists(fn)
+
+
+def test_unzip_data_raises(tmpdir):
+
+    fn = str(tmpdir + 
"/123456789123456_2020-01-01_0000.csv") + zip_fn = fn + ".zip" + + with pytest.raises(Exception): + unzip_data(zip_fn, tmpdir) diff --git a/tests/test_url.py b/tests/test_url.py deleted file mode 100644 index 919af62..0000000 --- a/tests/test_url.py +++ /dev/null @@ -1,40 +0,0 @@ -import pytest -from datetime import datetime -from pull_fb import url - - -@pytest.fixture -def tilemovement_res(): - - return url.format_urls('TileMovement', '123', [datetime(2000, 1, 1)]) - - -@pytest.fixture -def tilepopulation_res(): - - return url.format_urls('TilePopulation', '123', [datetime(2000, 1, 1)]) - - -def test_format_urls_is_list(tilemovement_res): - - assert type(tilemovement_res) is list - - -def test_format_urls_item_is_dict(tilemovement_res): - - assert type(tilemovement_res[0]) is dict - - -def test_format_urls_url_is_str(tilemovement_res): - - assert type(tilemovement_res[0]['url']) is str - - -def test_format_urls_url_tilemovement_has_vector(tilemovement_res): - - assert 'vector' in tilemovement_res[0]['url'] - - -def test_format_urls_url_tilepopulation_has_raster(tilepopulation_res): - - assert 'raster' in tilepopulation_res[0]['url'] diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index bb8bb95..0000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,201 +0,0 @@ -import pytest -from pull_fb import utils -from datetime import datetime - - -@pytest.fixture -def example_date_config(): - return { - "start_date": datetime(2020, 1, 1), - "end_date": datetime(2020, 1, 2), - "frequency": 8, - } - - -@pytest.fixture(scope="session") -def local_config_file(tmpdir_factory): - - fn = tmpdir_factory.mktemp("tmp").join(".config") - - with open(fn, 'w') as f: - - f.write('Britain_TileMovement_ID=1671212783027520\nBritain_TileMovement_Origin=2020_03_10_0') - - return fn - - -@pytest.fixture(scope="session") -def local_config_file_missing_id(tmpdir_factory): - - fn = tmpdir_factory.mktemp("tmp").join(".config") - - with open(fn, 'w') as f: - - f.write('Britain_TileMovement_Origin=2020_03_10_0') - - return fn - - -@pytest.fixture(scope="session") -def local_config_file_missing_origin(tmpdir_factory): - - fn = tmpdir_factory.mktemp("tmp").join(".config") - - with open(fn, 'w') as f: - - f.write('Britain_TileMovement_ID=1671212783027520') - - return fn - - -@pytest.fixture(scope="session") -def local_config_file_malformed(tmpdir_factory): - - fn = tmpdir_factory.mktemp("tmp").join(".config") - - with open(fn, 'w') as f: - - f.write('Britain_Colocation_ID=229180671540661Britain_Colocation_Origin=2020_02_11Britain_TilePopulation_ID=881889318900484') - - return fn - - -@pytest.fixture(scope="session") -def output_csv(tmpdir_factory): - - fn = tmpdir_factory.mktemp("tmp").join("Britain_2020_06_05_1600.csv") - - with open(fn, 'w') as f: - - f.write('test') - - return fn - - -# Test date_str_to_datetime -def test_date_str_to_datetime_hours(): - - s = "2020_04_30_16" - - res = utils.date_str_to_datetime(s) - - assert type(res) is datetime - - -def test_date_str_to_datetime_days(): - - s = "2020_04_30" - - res = utils.date_str_to_datetime(s) - - assert type(res) is datetime - - -def test_date_str_to_datetime_errors(): - - s = "not a date" - - with pytest.raises(ValueError): - - utils.date_str_to_datetime(s) - - -# Test get_file_dates -def test_get_file_dates_8h(example_date_config): - - res = utils.get_file_dates( - example_date_config["start_date"], example_date_config["end_date"], 8 - ) - - assert len(res) == 3 - - -def test_get_file_dates_12h(example_date_config): - - res = 
utils.get_file_dates(
-        example_date_config["start_date"], example_date_config["end_date"], 12
-    )
-
-    assert len(res) == 2
-
-
-def test_get_file_dates_type(example_date_config):
-
-    res = utils.get_file_dates(
-        example_date_config["start_date"], example_date_config["end_date"], 12
-    )
-
-    assert type(res[0]) is datetime
-
-
-# test get_config
-def test_get_config_remote():
-
-    path = "https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config"
-
-    res = utils.get_config(path)
-
-    assert type(res) is dict
-
-
-def test_get_config_local(local_config_file):
-
-    res = utils.get_config(str(local_config_file))
-
-    assert type(res) is dict
-
-
-def test_get_config_local_raises_malformed(local_config_file_malformed):
-
-    with pytest.raises(Exception):
-
-        utils.get_config(str(local_config_file_malformed))
-
-
-# test get download variables
-def test_get_download_variables_works(local_config_file):
-
-    now = datetime.now()
-
-    res = utils.get_download_variables('TileMovement',
-                                       'Britain',
-                                       now,
-                                       str(local_config_file))
-
-    assert type(res) is dict
-
-    assert res['dataset_id'] == '1671212783027520'
-
-    assert res['start_date'] == datetime(2020, 3, 10, 0)
-
-    assert res['end_date'] == now
-
-
-def test_get_download_variables_missing_id(local_config_file_missing_id):
-
-    with pytest.raises(KeyError):
-
-        utils.get_download_variables('TileMovement',
-                                     'Britain',
-                                     datetime.now(),
-                                     str(local_config_file_missing_id))
-
-
-def test_get_download_variables_missing_origin(local_config_file_missing_origin):
-
-    with pytest.raises(KeyError):
-
-        utils.get_download_variables('TileMovement',
-                                     'Britain',
-                                     datetime.now(),
-                                     str(local_config_file_missing_origin))
-
-
-# test get_existing_dates
-def test_get_existing_dates(output_csv):
-
-    outdir = '/'.join(str(output_csv).split('/')[:-1])
-
-    res = utils.get_existing_dates(outdir, 'Britain')
-
-    assert type(res) is list
diff --git a/tests/test_write_zipfile.py b/tests/test_write_zipfile.py
new file mode 100644
index 0000000..222c765
--- /dev/null
+++ b/tests/test_write_zipfile.py
@@ -0,0 +1,24 @@
+import os
+import pytest
+from tests.utils import tmpdir
+
+from pull_fb.collection import write_zipfile
+
+
+def test_write_zipfile(tmpdir):
+
+    fn = tmpdir + "/test.zip"
+    data = [b"a"]
+
+    write_zipfile(fn, data)
+
+    assert os.path.exists(fn)
+
+
+def test_write_zipfile_raises(tmpdir):
+
+    fn = tmpdir + "/test.zip"
+    data = [None]
+
+    with pytest.raises(Exception):
+        write_zipfile(fn, data)
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..3a9444e
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,6 @@
+import pytest
+
+@pytest.fixture(scope="session")
+def tmpdir(tmpdir_factory):
+    tmp = tmpdir_factory.mktemp("data")
+    return tmp
diff --git a/tox.ini b/tox.ini
index 97cf870..d85ec16 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,10 +1,15 @@
 [tox]
-envlist = py38
+envlist = py37, py38, py39
+isolated_build = True
+
+[gh-actions]
+python =
+    3.7: py37
+    3.8: py38
+    3.9: py39
 [testenv]
deps =
     pytest
-    requests
-    selenium
-    pandas
+    browser_cookie3
commands = pytest
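
With this change, the CLI surface becomes `pull_fb auth status`, `pull_fb collection init --dataset_id <ID> --start_date <YYYY-MM-DD> [--end_date <YYYY-MM-DD>]`, and `pull_fb collection update`. For reference, `collection update` is equivalent to the following direct use of the new modules — a sketch using only names introduced in this diff, run from a directory already populated by `collection init`:

```python
from pull_fb.auth import get_auth_cookies
from pull_fb.collection import download_data, get_update_config

# Load .facebook.com cookies from the default browser via browser_cookie3.
cookies = get_auth_cookies()

# Derive start_date (newest local file), end_date (today) and the
# dataset id from the CSV filenames in the current directory.
config = get_update_config()

download_data(
    config["dataset_id"],
    config["start_date"],
    config["end_date"],
    cookies,
)
```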
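
A minimal smoke test for the new command tree (not part of this diff) could look like the sketch below: it stubs the cookie and auth helpers that `pull_fb.pull_fb` imports, so the command runs without a browser profile or network access.

```python
from click.testing import CliRunner

import pull_fb.pull_fb as pull_fb_cli


def test_auth_status_smoke(monkeypatch):

    # Replace the helpers bound into pull_fb.pull_fb at import time,
    # so `auth status` neither reads browser cookies nor hits Facebook.
    monkeypatch.setattr(pull_fb_cli, "get_auth_cookies", lambda: {})
    monkeypatch.setattr(pull_fb_cli, "check_auth", lambda cookies: None)

    result = CliRunner().invoke(pull_fb_cli.cli, ["auth", "status"])

    assert result.exit_code == 0
```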