Adding SPARQL data querying #57

Open · wants to merge 4 commits into base: dev
1 change: 1 addition & 0 deletions README.md
@@ -14,6 +14,7 @@ Package providing simple Python access to data in:
 * AWS s3
 * MySQL
 * neo4j
+* SPARQL
 
 Sroka library was checked to work for Python **>=3.8, <=3.11**.

100 changes: 62 additions & 38 deletions Test APIs.ipynb
@@ -18,9 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# GA API\n",
"from sroka.api.ga.ga import ga_request\n",
@@ -43,10 +41,36 @@
"from sroka.api.s3_connection.s3_connection_api import s3_download_data, s3_upload_data\n",
"# MySQL API\n",
"from sroka.api.mysql.mysql import query_mysql\n",
"# SPARQL API\n",
"from sroka.api.sparql.sparql import query_sparql\n",
"\n",
"# data wrangling\n",
"import numpy as np"
]
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# SPARQL"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"df = query_sparql(\"\"\"SELECT \n",
"\t?game \n",
"\t?gameLabel \n",
"\t?gameTitle\n",
"WHERE {\n",
"\t?game wdt:P31 wd:Q7889 . # instance of video game\n",
"}\n",
"LIMIT 10\"\"\", endpoint_url='https://query.wikidata.org/sparql')\n",
"df"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -57,16 +81,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = query_athena(\"\"\"\n",
" SELECT '2019-03-01' as date\n",
" \"\"\")\n",
"\n",
"df"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -77,38 +101,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# input a path to data on your s3, it is needed to perform any query\n",
"s3_folder = ''\n",
"\n",
"s3_download_data('s3://{}'.format(s3_folder), prefix=True, sep=';')"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# input bucket name and file path on your s3, it is needed to perform any query\n",
"s3_bucket = ''\n",
"s3_file_path = ''\n",
"\n",
"# create a test array\n",
"arr = np.array(([1,2,3,4], [4,3,2,1]))"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s3_upload_data(arr, bucket=s3_bucket, path=s3_file_path)"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -119,9 +143,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start_day = '01'\n",
"end_day='04'\n",
@@ -142,7 +164,9 @@
"\n",
"df_gam = get_data_from_admanager(query, dimensions, columns, start_date, stop_date)\n",
"df_gam.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -153,9 +177,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# your account id, it is needed to perform any query\n",
"your_id = ''\n",
@@ -172,7 +194,9 @@
"\n",
"df_ga = ga_request(request, print_sample_size=True, sampling_level='FASTER')\n",
"df_ga.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -183,14 +207,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_sheet = google_drive_sheets_create('new_sheet')\n",
"\n",
"google_drive_sheets_write(df, new_sheet)"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -201,9 +225,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_data_moat = {\n",
" 'start' : '20190301',\n",
@@ -213,7 +235,9 @@
"\n",
"df_moat = get_data_from_moat(input_data_moat, 'moat')\n",
"df_moat.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -224,17 +248,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"presto_query = \"\"\"\n",
" SELECT '2019-03-01' as date;\n",
" \"\"\"\n",
"\n",
"data_presto = request_qubole(presto_query, query_type='hive')\n",
"data_presto.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -245,9 +269,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_data = {\n",
" 'start' : '2018-08-23T00:00:00-07:00',\n",
@@ -264,7 +286,9 @@
"\n",
"data = get_data_from_rubicon(input_data)\n",
"data.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
@@ -275,20 +299,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = query_mysql(\"SELECT * FROM clan LIMIT 10\")\n",
"df.head()"
]
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [],
"outputs": [],
"source": []
"execution_count": null
}
],
"metadata": {
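The module behind `from sroka.api.sparql.sparql import query_sparql` is not shown in this view. Below is a minimal sketch of what it might contain, assuming it wraps SPARQLWrapper (the dependency pinned in requirements.txt below) and flattens the JSON result bindings into a pandas DataFrame; everything here beyond the `query_sparql` name and its notebook usage is an assumption, not the PR's actual code.

```python
# Hypothetical sketch of sroka/api/sparql/sparql.py -- not the PR's actual body.
import pandas as pd
from SPARQLWrapper import JSON, SPARQLWrapper


def query_sparql(query, endpoint_url):
    """Run a SPARQL query and return the result bindings as a DataFrame."""
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # Each binding maps a variable name to {'type': ..., 'value': ...};
    # keep just the values so the DataFrame gets one column per variable.
    rows = [
        {var: cell['value'] for var, cell in binding.items()}
        for binding in results['results']['bindings']
    ]
    return pd.DataFrame(rows)
```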
3 changes: 3 additions & 0 deletions config.sample.ini
@@ -35,3 +35,6 @@ database: DATABASE
 neo4j_username: USERNAME
 neo4j_password: PASSWORD
 neo4j_address: ADDRESS:PORT
+
+[sparql]
+endpoint_url: ENDPOINT_URL
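For reference, a sketch of reading the new section with the standard-library configparser; the config file path is an assumption, and sroka's own config handling may differ:

```python
from configparser import ConfigParser

config = ConfigParser()
config.read('config.ini')  # assumed path to your filled-in sroka config

# e.g. 'https://query.wikidata.org/sparql', as used in the notebook above
endpoint_url = config.get('sparql', 'endpoint_url')
```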
1 change: 1 addition & 0 deletions requirements.txt
@@ -17,4 +17,5 @@ requests>=2.20
 retrying>=1.3.3
 urllib3>=1.26.18
 py2neo>=4.2.0
+SPARQLWrapper>=2.0.0
 db-dtypes
4 changes: 2 additions & 2 deletions setup.py
@@ -8,10 +8,10 @@

 setuptools.setup(
     name="sroka",
-    version="0.0.8",
+    version="0.0.9",
     author="Ad Engineering FANDOM",
     author_email="[email protected]",
-    description="Package for access GA, GAM, MOAT, Qubole, Athena, S3, Rubicon APIs, BigQuery, MySQL",
+    description="Package for access GA, GAM, MOAT, Qubole, Athena, S3, Rubicon APIs, BigQuery, MySQL, SPARQL",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/Wikia/sroka",
22 changes: 22 additions & 0 deletions sroka/api/helpers.py
@@ -0,0 +1,22 @@
+import os
+from pathlib import Path
+
+
+def save_to_file(df, filename):
+    # Store the path in a cross-platform pathlib object to ensure compatibility
+    # with DOS & UNIX-based operating systems.
+    path = Path(filename)
+
+    # Get the parent directory of the given path.
+    directory_path = str(path.parent.resolve())
+
+    # Create the parent directory if needed. If it already exists, the
+    # `exist_ok` option ensures that no exception will be thrown.
+    if directory_path != "":
+        os.makedirs(directory_path, exist_ok=True)
+
+    # Export the data to a CSV file.
+    try:
+        df.to_csv(filename)
+    except OSError as e:
+        print('Unable to write on filesystem: {}'.format(e))
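A quick usage sketch for the new helper; the DataFrame contents and the target path are illustrative:

```python
import pandas as pd

from sroka.api.helpers import save_to_file

df = pd.DataFrame({'date': ['2019-03-01'], 'value': [1]})

# The parent directory ('reports' here) is created on demand before the
# CSV is written; filesystem errors are printed rather than raised.
save_to_file(df, 'reports/example.csv')
```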
23 changes: 3 additions & 20 deletions sroka/api/mysql/mysql.py
@@ -1,11 +1,10 @@
-import os
 import mysql.connector
 import pandas as pd
 from configparser import NoSectionError
-from pathlib import Path
 from mysql.connector.errors import DatabaseError, OperationalError, InternalError
 from retrying import retry
 from sroka.api.mysql.mysql_helpers import validate_options, get_options_from_config
+from sroka.api.helpers import save_to_file
 
 
 @retry(stop_max_attempt_number=1,
@@ -72,21 +71,5 @@ def query_mysql(query: str, filename=None,
     # Otherwise, store it in a file.
     if not filename:
         return df
-
-    # Store the path in a cross-platform pathlib object to ensure compatibility
-    # with DOS & UNIX-based operating systems.
-    path = Path(filename)
-
-    # Get the parent directory of the given path, if it exists.
-    directory_path = str(path.parent.resolve())
-
-    # If the given path points to a folder, attempt to create it. If it already
-    # exists, the `exist_ok` option ensures that no exception will be thrown.
-    if directory_path != "":
-        os.makedirs(directory_path, exist_ok=True)
-
-    # Export the data in a CSV file.
-    try:
-        df.to_csv(filename)
-    except OSError as e:
-        print('Unable to write on filesystem: {}'.format(e))
+    else:
+        save_to_file(df, filename)
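After the refactor, `query_mysql` keeps its contract: it returns the DataFrame when `filename` is omitted and otherwise delegates the CSV export to the shared helper. A usage sketch, with an illustrative query and path:

```python
from sroka.api.mysql.mysql import query_mysql

# No filename: the result comes back as a DataFrame.
df = query_mysql("SELECT * FROM clan LIMIT 10")

# With a filename: the result is written to CSV via save_to_file.
query_mysql("SELECT * FROM clan LIMIT 10", filename="exports/clan.csv")
```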