diff --git a/README.md b/README.md index ed8da69..d2102ed 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ Package providing simple Python access to data in: * AWS s3 * MySQL * neo4j +* SPARQL Sroka library was checked to work for Python **>=3.8, <=3.11**. diff --git a/Test APIs.ipynb b/Test APIs.ipynb index 1a9e963..cb47aad 100644 --- a/Test APIs.ipynb +++ b/Test APIs.ipynb @@ -18,9 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, "metadata": {}, - "outputs": [], "source": [ "# GA API\n", "from sroka.api.ga.ga import ga_request\n", @@ -43,10 +41,36 @@ "from sroka.api.s3_connection.s3_connection_api import s3_download_data, s3_upload_data\n", "# MySQL API\n", "from sroka.api.mysql.mysql import query_mysql\n", + "# SPARQL API\n", + "from sroka.api.sparql.sparql import query_sparql\n", "\n", "# data wrangling\n", "import numpy as np" - ] + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "# SPARQL" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "df = query_sparql(\"\"\"SELECT \n", + "\t?game \n", + "\t?gameLabel \n", + "\t?gameTitle\n", + "WHERE {\n", + "\t?game wdt:P31 wd:Q7889 . # instance of video game\n", + "}\n", + "LIMIT 10\"\"\", endpoint_url='https://query.wikidata.org/sparql')\n", + "df" + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -57,16 +81,16 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "df = query_athena(\"\"\"\n", " SELECT '2019-03-01' as date\n", " \"\"\")\n", "\n", "df" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -77,21 +101,19 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "# input a path to data on your s3, it is needed to perform any query\n", "s3_folder = ''\n", "\n", "s3_download_data('s3://{}'.format(s3_folder), prefix=True, sep=';')" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "# input bucket name and file path on your s3, it is needed to perform any query\n", "s3_bucket = ''\n", @@ -99,16 +121,18 @@ "\n", "# create a test array\n", "arr = np.array(([1,2,3,4], [4,3,2,1]))" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "s3_upload_data(arr, bucket=s3_bucket, path=s3_file_path)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -119,9 +143,7 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "start_day = '01'\n", "end_day='04'\n", @@ -142,7 +164,9 @@ "\n", "df_gam = get_data_from_admanager(query, dimensions, columns, start_date, stop_date)\n", "df_gam.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -153,9 +177,7 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "# your account id, it is needed to perform any query\n", "your_id = ''\n", @@ -172,7 +194,9 @@ "\n", "df_ga = ga_request(request, print_sample_size=True, sampling_level='FASTER')\n", "df_ga.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -183,14 +207,14 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "new_sheet = 
google_drive_sheets_create('new_sheet')\n", "\n", "google_drive_sheets_write(df, new_sheet)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -201,9 +225,7 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "input_data_moat = {\n", " 'start' : '20190301',\n", @@ -213,7 +235,9 @@ "\n", "df_moat = get_data_from_moat(input_data_moat, 'moat')\n", "df_moat.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -224,9 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "presto_query = \"\"\"\n", " SELECT '2019-03-01' as date;\n", @@ -234,7 +256,9 @@ "\n", "data_presto = request_qubole(presto_query, query_type='hive')\n", "data_presto.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -245,9 +269,7 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "input_data = {\n", " 'start' : '2018-08-23T00:00:00-07:00',\n", @@ -264,7 +286,9 @@ "\n", "data = get_data_from_rubicon(input_data)\n", "data.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -275,20 +299,20 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ "df = query_mysql(\"SELECT * FROM clan LIMIT 10\")\n", "df.head()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "metadata": {}, + "source": [], "outputs": [], - "source": [] + "execution_count": null } ], "metadata": { diff --git a/config.sample.ini b/config.sample.ini index 3b29ba6..c7848c3 100644 --- a/config.sample.ini +++ b/config.sample.ini @@ -35,3 +35,6 @@ database: DATABASE neo4j_username: USERNAME neo4j_password: PASSWORD neo4j_address: ADDRESS:PORT + +[sparql] +endpoint_url: ENDPOINT_URL diff --git a/requirements.txt b/requirements.txt index 8a77f98..9dfab3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,5 @@ requests>=2.20 retrying>=1.3.3 urllib3>=1.26.18 py2neo>=4.2.0 +SPARQLWrapper>=2.0.0 db-dtypes diff --git a/setup.py b/setup.py index d8d2047..94165fa 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,10 @@ setuptools.setup( name="sroka", - version="0.0.8", + version="0.0.9", author="Ad Engineering FANDOM", author_email="murbanek@fandom.com", - description="Package for access GA, GAM, MOAT, Qubole, Athena, S3, Rubicon APIs, BigQuery, MySQL", + description="Package for access GA, GAM, MOAT, Qubole, Athena, S3, Rubicon APIs, BigQuery, MySQL, SPARQL", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/Wikia/sroka", diff --git a/sroka/api/helpers.py b/sroka/api/helpers.py new file mode 100644 index 0000000..343a594 --- /dev/null +++ b/sroka/api/helpers.py @@ -0,0 +1,22 @@ +import os +from pathlib import Path + + +def save_to_file(df, filename): + # Store the path in a cross-platform pathlib object to ensure compatibility + # with DOS & UNIX-based operating systems. + path = Path(filename) + + # Get the parent directory of the given path, if it exists. + directory_path = str(path.parent.resolve()) + + # If the given path points to a folder, attempt to create it. If it already + # exists, the `exist_ok` option ensures that no exception will be thrown. + if directory_path != "": + os.makedirs(directory_path, exist_ok=True) + + # Export the data in a CSV file. 
+    try:
+        df.to_csv(filename)
+    except OSError as e:
+        print('Unable to write on filesystem: {}'.format(e))
diff --git a/sroka/api/mysql/mysql.py b/sroka/api/mysql/mysql.py
index b9f6360..5d81b64 100644
--- a/sroka/api/mysql/mysql.py
+++ b/sroka/api/mysql/mysql.py
@@ -1,11 +1,10 @@
-import os
 import mysql.connector
 import pandas as pd
 from configparser import NoSectionError
-from pathlib import Path
 from mysql.connector.errors import DatabaseError, OperationalError, InternalError
 from retrying import retry
 from sroka.api.mysql.mysql_helpers import validate_options, get_options_from_config
+from sroka.api.helpers import save_to_file
 
 
 @retry(stop_max_attempt_number=1,
@@ -72,21 +71,5 @@ def query_mysql(query: str, filename=None,
     # Otherwise, store it in a file.
     if not filename:
         return df
-
-    # Store the path in a cross-platform pathlib object to ensure compatibility
-    # with DOS & UNIX-based operating systems.
-    path = Path(filename)
-
-    # Get the parent directory of the given path, if it exists.
-    directory_path = str(path.parent.resolve())
-
-    # If the given path points to a folder, attempt to create it. If it already
-    # exists, the `exist_ok` option ensures that no exception will be thrown.
-    if directory_path != "":
-        os.makedirs(directory_path, exist_ok=True)
-
-    # Export the data in a CSV file.
-    try:
-        df.to_csv(filename)
-    except OSError as e:
-        print('Unable to write on filesystem: {}'.format(e))
+    else:
+        save_to_file(df, filename)
diff --git a/sroka/api/sparql/README.md b/sroka/api/sparql/README.md
new file mode 100644
index 0000000..16ed85f
--- /dev/null
+++ b/sroka/api/sparql/README.md
@@ -0,0 +1,53 @@
+# SPARQL connector
+
+## Configuration
+
+Here are the configuration variables that the `sparql` connector supports:
+* `endpoint_url`: Endpoint URL of the SPARQL database. Optional. It needs to be defined either in the `config.ini` file or passed directly to `query_sparql`.
+
+## Methods
+
+### `query_sparql(query, endpoint_url=None, filename=None)`
+
+#### Arguments
+
+* string `query` - SPARQL query to run
+* string `endpoint_url` - endpoint URL of the SPARQL database
+* string `filename` - path to the file in which to store the results (optional; if `filename=None`, results are returned as a [`pandas`](https://pandas.pydata.org/pandas-docs/stable/) [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html))
+
+
+#### Returns
+
+* A [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html) containing the query results, or `None` if the data was saved to a file.
+
+#### Usage
+
+```python
+from sroka.api.sparql.sparql import query_sparql
+
+## Results saved to the file `results.csv`. Endpoint URL taken from config.
+query_sparql("""
+SELECT ?subject ?predicate ?object
+WHERE {
+    ?subject ?predicate ?object .
+}
+""", filename='results.csv')
+
+## Results saved to the variable `dataframe`, as a pandas.DataFrame.
+dataframe = query_sparql("""
+SELECT ?subject ?predicate ?object
+WHERE {
+    ?subject ?predicate ?object .
+}
+""")
+
+## Queries the database with an endpoint URL different from the config.ini file
+dataframe = query_sparql("""
+SELECT ?subject ?predicate ?object
+WHERE {
+    ?subject ?predicate ?object .
+}
+""", endpoint_url='https://query.database.org/sparql')
+
+```
+
diff --git a/sroka/api/sparql/__init__.py b/sroka/api/sparql/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sroka/api/sparql/sparql.py b/sroka/api/sparql/sparql.py
new file mode 100644
index 0000000..8abfc92
--- /dev/null
+++ b/sroka/api/sparql/sparql.py
@@ -0,0 +1,36 @@
+from SPARQLWrapper import SPARQLWrapper, JSON
+from sroka.api.sparql.sparql_helpers import get_options_from_config
+from configparser import NoSectionError
+from sroka.api.helpers import save_to_file
+import pandas as pd
+
+
+def query_sparql(query, endpoint_url=None, filename=None):
+    try:
+        options = get_options_from_config()
+
+    except NoSectionError:
+        print('Missing SPARQL section in configuration')
+        return pd.DataFrame([])
+    if endpoint_url:
+        options['endpoint_url'] = endpoint_url
+
+    sparql = SPARQLWrapper(options["endpoint_url"])
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    results = sparql.queryAndConvert()['results']['bindings']
+    dict_results = []
+    for result in results:
+        new_result = {}
+        for key in result:
+            new_result[key] = result[key]['value']
+        dict_results.append(new_result)
+
+    df = pd.DataFrame.from_dict(dict_results)
+
+    # If no filename is specified, return the data as a pandas DataFrame.
+    # Otherwise, store it in a file.
+    if not filename:
+        return df
+    else:
+        save_to_file(df, filename)
diff --git a/sroka/api/sparql/sparql_helpers.py b/sroka/api/sparql/sparql_helpers.py
new file mode 100644
index 0000000..4bac0ab
--- /dev/null
+++ b/sroka/api/sparql/sparql_helpers.py
@@ -0,0 +1,12 @@
+import sroka.config.config as config
+from configparser import NoOptionError
+
+def get_options_from_config():
+    options = dict()
+
+    try:
+        options["endpoint_url"] = config.get_value('sparql', 'endpoint_url')
+    except (KeyError, NoOptionError):  # Do nothing, this value is optional.
+        pass
+
+    return options
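
Below is a minimal sketch of how the new connector can be smoke-tested once this change is installed. It assumes the branch and its requirements (including `SPARQLWrapper`) are installed and that the public Wikidata endpoint is reachable; the query mirrors the cell added to `Test APIs.ipynb`, and the output filename is only illustrative.

```python
# Quick check of the SPARQL connector added in this change.
# Assumptions: this branch is installed, https://query.wikidata.org/sparql is reachable,
# and 'video_games.csv' is an arbitrary example filename.
from sroka.api.sparql.sparql import query_sparql

query = """
SELECT ?game
WHERE {
    ?game wdt:P31 wd:Q7889 .  # instance of video game
}
LIMIT 10
"""

# With no filename, results come back as a pandas DataFrame.
df = query_sparql(query, endpoint_url='https://query.wikidata.org/sparql')
print(df.head())

# With a filename, results are written to CSV via the shared save_to_file() helper.
query_sparql(query, endpoint_url='https://query.wikidata.org/sparql', filename='video_games.csv')
```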