
Commit 1ab71c2

add fwd_creds flag to send credentials

1 parent 4673b82 commit 1ab71c2

2 files changed: 46 additions, 21 deletions

dask_bigquery/core.py

Lines changed: 29 additions & 16 deletions
@@ -21,7 +21,7 @@
 @contextmanager
-def bigquery_clients(project_id, credentials):
+def bigquery_clients(project_id, credentials=None):
     """This context manager is a temporary solution until there is an
     upstream solution to handle this.
     See googleapis/google-cloud-python#9457
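
Defaulting credentials to None means workers that receive no forwarded credentials let the google-cloud clients resolve Application Default Credentials on their own. A minimal sketch of that fallback, assuming the context manager constructs the two clients roughly like this (the helper name and body are illustrative, not the file's actual code):

from contextlib import contextmanager

from google.cloud import bigquery, bigquery_storage


@contextmanager
def bigquery_clients_sketch(project_id, credentials=None):
    # credentials=None makes both clients fall back to Application Default
    # Credentials, e.g. GOOGLE_APPLICATION_CREDENTIALS set on the worker.
    bq_client = bigquery.Client(project_id, credentials=credentials)
    try:
        bqs_client = bigquery_storage.BigQueryReadClient(credentials=credentials)
        yield bq_client, bqs_client
    finally:
        bq_client.close()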
@@ -73,17 +73,20 @@ def bigquery_read(
         Name of the BigQuery project.
     read_kwargs: dict
         kwargs to pass to read_rows()
-    creds: dict
-        credentials dictionary
     stream_name: str
         BigQuery Storage API Stream "name"
         NOTE: Please set if reading from Storage API without any `row_restriction`.
               https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1#stream
+    cred_token: str
+        google_auth bearer token
     """
 
-    credentials = google.oauth2.credentials.Credentials(cred_token)
+    if cred_token:
+        credentials = google.oauth2.credentials.Credentials(cred_token)
+    else:
+        credentials = None
 
-    with bigquery_clients(project_id, credentials) as (_, bqs_client):
+    with bigquery_clients(project_id, credentials=credentials) as (_, bqs_client):
         session = bqs_client.create_read_session(make_create_read_session_request())
         schema = pyarrow.ipc.read_schema(
             pyarrow.py_buffer(session.arrow_schema.serialized_schema)
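
Rather than shipping a pickled credentials object, the commit forwards a bare bearer-token string and rebuilds google.oauth2.credentials.Credentials from it on the worker. A round-trip sketch under that reading (the key-file path is hypothetical):

import google.auth.transport.requests
import google.oauth2.credentials
from google.oauth2 import service_account

# Client side: mint a short-lived bearer token from a service-account file.
source = service_account.Credentials.from_service_account_file(
    "key.json",  # hypothetical path
    scopes=["https://www.googleapis.com/auth/bigquery.readonly"],
)
source.refresh(google.auth.transport.requests.Request())
cred_token = source.token  # a plain string, cheap to send to workers

# Worker side: rebuild Credentials from the forwarded token. There is no
# refresh token here, so reads must finish before the token expires.
credentials = google.oauth2.credentials.Credentials(cred_token)

Forwarding only the token also keeps the service account's private key out of the task graph.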
@@ -103,6 +106,7 @@ def read_gbq(
     row_filter: str = "",
     columns: list[str] = None,
     read_kwargs: dict = None,
+    fwd_creds: bool = False,
 ):
     """Read table as dask dataframe using BigQuery Storage API via Arrow format.
     Partitions will be approximately balanced according to BigQuery stream allocation logic.
@@ -121,26 +125,35 @@ def read_gbq(
         list of columns to load from the table
     read_kwargs: dict
         kwargs to pass to read_rows()
+    fwd_creds: bool
+        Set to True to forward credentials to the workers. Defaults to False.
 
     Returns
     -------
     Dask DataFrame
     """
     read_kwargs = read_kwargs or {}
 
-    creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-    if creds_path is None:
-        raise ValueError("No credentials found")
+    if fwd_creds:
+        creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
+        if creds_path is None:
+            raise ValueError("No credentials found")
 
-    credentials = service_account.Credentials.from_service_account_file(
-        creds_path, scopes=["https://www.googleapis.com/auth/bigquery.readonly"]
-    )
-
-    auth_req = google.auth.transport.requests.Request()
-    credentials.refresh(auth_req)
-    cred_token = credentials.token
+        credentials = service_account.Credentials.from_service_account_file(
+            creds_path, scopes=["https://www.googleapis.com/auth/bigquery.readonly"]
+        )
 
-    with bigquery_clients(project_id, credentials) as (bq_client, bqs_client):
+        auth_req = google.auth.transport.requests.Request()
+        credentials.refresh(auth_req)
+        cred_token = credentials.token
+    else:
+        credentials = None
+        cred_token = None
+
+    with bigquery_clients(project_id, credentials=credentials) as (
+        bq_client,
+        bqs_client,
+    ):
         table_ref = bq_client.get_table(f"{dataset_id}.{table_id}")
         if table_ref.table_type == "VIEW":
             raise TypeError("Table type VIEW not supported")

dask_bigquery/tests/test_core.py

Lines changed: 17 additions & 5 deletions
@@ -51,43 +51,54 @@ def dataset(df):
     )
 
 
-def test_read_gbq(df, dataset, client):
+@pytest.mark.parametrize("fwd_creds", [False, True])
+def test_read_gbq(df, dataset, fwd_creds, client):
     project_id, dataset_id, table_id = dataset
-    ddf = read_gbq(project_id=project_id, dataset_id=dataset_id, table_id=table_id)
+    ddf = read_gbq(
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        fwd_creds=fwd_creds,
+    )
 
     assert list(ddf.columns) == ["name", "number", "idx"]
     assert ddf.npartitions == 2
     assert assert_eq(ddf.set_index("idx"), df.set_index("idx"))
 
 
-def test_read_row_filter(df, dataset, client):
+@pytest.mark.parametrize("fwd_creds", [False, True])
+def test_read_row_filter(df, dataset, fwd_creds, client):
     project_id, dataset_id, table_id = dataset
     ddf = read_gbq(
         project_id=project_id,
         dataset_id=dataset_id,
         table_id=table_id,
         row_filter="idx < 5",
+        fwd_creds=fwd_creds,
     )
 
     assert list(ddf.columns) == ["name", "number", "idx"]
     assert ddf.npartitions == 2
     assert assert_eq(ddf.set_index("idx").loc[:4], df.set_index("idx").loc[:4])
 
 
-def test_read_kwargs(dataset, client):
+@pytest.mark.parametrize("fwd_creds", [False, True])
+def test_read_kwargs(dataset, fwd_creds, client):
     project_id, dataset_id, table_id = dataset
     ddf = read_gbq(
         project_id=project_id,
         dataset_id=dataset_id,
         table_id=table_id,
         read_kwargs={"timeout": 1e-12},
+        fwd_creds=fwd_creds,
     )
 
     with pytest.raises(Exception, match="Deadline Exceeded"):
         ddf.compute()
 
 
-def test_read_columns(df, dataset, client):
+@pytest.mark.parametrize("fwd_creds", [False, True])
+def test_read_columns(df, dataset, fwd_creds, client):
     project_id, dataset_id, table_id = dataset
     assert df.shape[1] > 1, "Test data should have multiple columns"
 
@@ -97,5 +108,6 @@ def test_read_columns(df, dataset, client):
         dataset_id=dataset_id,
         table_id=table_id,
         columns=columns,
+        fwd_creds=fwd_creds,
     )
     assert list(ddf.columns) == columns
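
Because each test is now parametrized over fwd_creds, pytest generates two ids per function, e.g. test_read_gbq[False] and test_read_gbq[True], so both the default-credentials path and the forwarded-token path are exercised by the same assertions. To see the expanded cases locally (invocation is illustrative):

pytest dask_bigquery/tests/test_core.py -v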
