[io] feat: switch to pandas_gbq to write data to BigQuery

AndreyMarkinPPC · AndreyMarkinPPC · commit ece70f9fb4ea · 2025-10-24T16:03:24.000+04:00
diff --git a/libs/io/garf_io/__init__.py b/libs/io/garf_io/__init__.py
@@ -14,4 +14,4 @@
 
 """Writing GarfReport to anywhere."""
 
-__version__ = '0.0.13'
+__version__ = '0.0.14'
diff --git a/libs/io/garf_io/writers/bigquery_writer.py b/libs/io/garf_io/writers/bigquery_writer.py
@@ -16,22 +16,20 @@
 from __future__ import annotations
 
 import os
+from typing import Literal
 
 try:
+  import pandas as pd
+  import pandas_gbq
   from google.cloud import bigquery
 except ImportError as e:
   raise ImportError(
     'Please install garf-io with BigQuery support - `pip install garf-io[bq]`'
   ) from e
 
-import datetime
 import logging
-from collections.abc import Sequence
 
 import numpy as np
-import pandas as pd
-import proto  # type: ignore
-from garf_core import parsers
 from garf_core import report as garf_report
 from google.cloud import exceptions as google_cloud_exceptions
 
@@ -40,6 +38,13 @@
 
 logger = logging.getLogger(__name__)
 
+_WRITE_DISPOSITION_MAPPING = {
+  'WRITE_TRUNCATE': 'replace',
+  'WRITE_TRUNCATE_DATA': 'replace',
+  'WRITE_APPEND': 'append',
+  'WRITE_EMPTY': 'fail',
+}
+
 
 class BigQueryWriterError(exceptions.GarfIoError):
   """BigQueryWriter specific errors."""
@@ -60,9 +65,8 @@ def __init__(
     project: str | None = os.getenv('GOOGLE_CLOUD_PROJECT'),
     dataset: str = 'garf',
     location: str = 'US',
-    write_disposition: bigquery.WriteDisposition | str = (
-      bigquery.WriteDisposition.WRITE_TRUNCATE
-    ),
+    write_disposition: bigquery.WriteDisposition
+    | Literal['append', 'replace', 'fail'] = 'replace',
     **kwargs,
   ):
     """Initializes BigQueryWriter.
@@ -83,11 +87,20 @@ def __init__(
     self.project = project
     self.dataset_id = f'{project}.{dataset}'
     self.location = location
-    if isinstance(write_disposition, str):
-      write_disposition = getattr(
-        bigquery.WriteDisposition, write_disposition.upper()
+    if write_disposition in ('replace', 'append', 'fail'):
+      self.write_disposition = write_disposition
+    elif isinstance(write_disposition, bigquery.WriteDisposition):
+      self.write_disposition = _WRITE_DISPOSITION_MAPPING.get(
+        write_disposition.name
+      )
+    elif _WRITE_DISPOSITION_MAPPING.get(write_disposition.upper()):
+      self.write_disposition = _WRITE_DISPOSITION_MAPPING.get(
+        write_disposition.upper()
+      )
+    else:
+      raise BigQueryWriterError(
+        'Unsupported writer disposition, choose one of: replace, append, fail'
       )
-    self.write_disposition = write_disposition
 
   def __str__(self) -> str:
     return f'[BigQuery] - {self.dataset_id} at {self.location} location.'
@@ -118,19 +131,9 @@ def write(self, report: garf_report.GarfReport, destination: str) -> str:
       Name of the table in `dataset.table` format.
     """
     report = self.format_for_write(report)
-    schema = _define_schema(report)
     destination = formatter.format_extension(destination)
     _ = self.create_or_get_dataset()
-    table = self._create_or_get_table(
-      f'{self.dataset_id}.{destination}', schema
-    )
-    job_config = bigquery.LoadJobConfig(
-      write_disposition=self.write_disposition,
-      schema=schema,
-      source_format='CSV',
-      max_bad_records=len(report),
-    )
-
+    table = f'{self.dataset_id}.{destination}'
     if not report:
       df = pd.DataFrame(
         data=report.results_placeholder, columns=report.column_names
@@ -139,123 +142,8 @@ def write(self, report: garf_report.GarfReport, destination: str) -> str:
       df = report.to_pandas()
     df = df.replace({np.nan: None})
     logger.debug('Writing %d rows of data to %s', len(df), destination)
-    job = self.client.load_table_from_dataframe(
-      dataframe=df, destination=table, job_config=job_config
+    pandas_gbq.to_gbq(
+      dataframe=df, destination_table=table, if_exists=self.write_disposition
     )
-    try:
-      job.result()
-      logger.debug('Writing to %s is completed', destination)
-    except google_cloud_exceptions.BadRequest as e:
-      raise ValueError(f'Unable to save data to BigQuery! {str(e)}') from e
+    logger.debug('Writing to %s is completed', destination)
     return f'[BigQuery] - at {self.dataset_id}.{destination}'
-
-  def _create_or_get_table(
-    self, table_name: str, schema: Sequence[bigquery.SchemaField]
-  ) -> bigquery.Table:
-    """Gets existing table or create a new one.
-
-    Args:
-      table_name: Name of the table in BigQuery.
-      schema: Schema of the table if one should be created.
-
-    Returns:
-      BigQuery table object.
-    """
-    try:
-      table = self.client.get_table(table_name)
-    except google_cloud_exceptions.NotFound:
-      table_ref = bigquery.Table(table_name, schema=schema)
-      table = self.client.create_table(table_ref)
-      table = self.client.get_table(table_name)
-    return table
-
-
-def _define_schema(
-  report: garf_report.GarfReport,
-) -> list[bigquery.SchemaField]:
-  """Infers schema from GarfReport.
-
-  Args:
-    report: GarfReport to infer schema from.
-
-  Returns:
-    Schema fields for a given report.
-
-  """
-  result_types = _get_result_types(report)
-  return _get_bq_schema(result_types)
-
-
-def _get_result_types(
-  report: garf_report.GarfReport,
-) -> dict[str, dict[str, parsers.ApiRowElement]]:
-  """Maps each column of report to BigQuery field type and repeated status.
-
-  Fields types are inferred based on report results or results placeholder.
-
-  Args:
-    report: GarfReport to infer field types from.
-
-  Returns:
-    Mapping between each column of report and its field type.
-  """
-  result_types: dict[str, dict[str, parsers.ApiRowElement]] = {}
-  column_names = report.column_names
-  for row in report.results or report.results_placeholder:
-    if set(column_names) == set(result_types.keys()):
-      break
-    for i, field in enumerate(row):
-      if field is None or column_names[i] in result_types:
-        continue
-      field_type = type(field)
-      if field_type in [
-        list,
-        proto.marshal.collections.repeated.RepeatedComposite,
-        proto.marshal.collections.repeated.Repeated,
-      ]:
-        repeated = True
-        field_type = str if len(field) == 0 else type(field[0])
-      else:
-        field_type = type(field)
-        repeated = False
-      result_types[column_names[i]] = {
-        'field_type': field_type,
-        'repeated': repeated,
-      }
-  return result_types
-
-
-def _get_bq_schema(
-  types: dict[str, dict[str, parsers.ApiRowElement]],
-) -> list[bigquery.SchemaField]:
-  """Converts report fields types to BigQuery schema fields.
-
-  Args:
-    types: Mapping between column names and its field type.
-
-  Returns:
-     BigQuery schema fields corresponding to GarfReport.
-  """
-  type_mapping = {
-    list: 'REPEATED',
-    str: 'STRING',
-    datetime.datetime: 'DATETIME',
-    datetime.date: 'DATE',
-    int: 'INT64',
-    float: 'FLOAT64',
-    bool: 'BOOL',
-    proto.marshal.collections.repeated.RepeatedComposite: 'REPEATED',
-    proto.marshal.collections.repeated.Repeated: 'REPEATED',
-  }
-
-  schema: list[bigquery.SchemaField] = []
-  for key, value in types.items():
-    field_type = type_mapping.get(value.get('field_type'))
-    schema.append(
-      bigquery.SchemaField(
-        name=key,
-        field_type=field_type if field_type else 'STRING',
-        mode='REPEATED' if value.get('repeated') else 'NULLABLE',
-      )
-    )
-  return schema
diff --git a/libs/io/tests/unit/writers/test_bigquery_writer.py b/libs/io/tests/unit/writers/test_bigquery_writer.py
@@ -13,46 +13,48 @@
 # limitations under the License.
 from __future__ import annotations
 
+import os
+
+import garf_core
 import pytest
 from garf_io.writers import bigquery_writer
 from google.cloud import bigquery
 
 
 class TestBigQueryWriter:
-  @pytest.fixture
-  def bq_writer(self):
-    return bigquery_writer.BigQueryWriter(project='test', dataset='test')
-
-  def test_get_results_types_returns_correct_mapping(self, sample_data):
-    result_types = bigquery_writer._get_result_types(sample_data)
-    assert result_types == {
-      'column_1': {'field_type': int, 'repeated': False},
-      'column_2': {'field_type': str, 'repeated': False},
-      'column_3': {'field_type': int, 'repeated': True},
-    }
-
-  def test_define_schema_returns_correct_schema_fields(self, sample_data):
-    schema = bigquery_writer._define_schema(sample_data)
-    assert schema == [
-      bigquery.SchemaField(
-        'column_1', 'INT64', 'NULLABLE', None, None, (), None
-      ),
-      bigquery.SchemaField(
-        'column_2', 'STRING', 'NULLABLE', None, None, (), None
-      ),
-      bigquery.SchemaField(
-        'column_3', 'INT64', 'REPEATED', None, None, (), None
-      ),
-    ]
+  @pytest.mark.skipif(
+    not os.environ.get('GOOGLE_CLOUD_PROJECT'),
+    reason='GOOGLE_CLOUD_PROJECT env variable not set.',
+  )
+  def test_write(self):
+    writer = bigquery_writer.BigQueryWriter(array_handling='arrays')
+    report = garf_core.GarfReport(
+      results=[
+        [{'key': ['one', 'two']}, 'three'],
+      ],
+      column_names=['column1', 'column2'],
+    )
+    result = writer.write(report, 'test')
+    assert result
 
-  def test_define_schema_correctly_handles_dates(self, sample_data_with_dates):
-    schema = bigquery_writer._define_schema(sample_data_with_dates)
-    assert schema == [
-      bigquery.SchemaField(
-        'column_1', 'INT64', 'NULLABLE', None, None, (), None
-      ),
-      bigquery.SchemaField(
-        'datetime', 'DATETIME', 'NULLABLE', None, None, (), None
-      ),
-      bigquery.SchemaField('date', 'DATE', 'NULLABLE', None, None, (), None),
-    ]
+  @pytest.mark.parametrize(
+    ('disposition', 'expected'),
+    [
+      ('append', 'append'),
+      ('replace', 'replace'),
+      ('fail', 'fail'),
+      ('write_append', 'append'),
+      ('write_truncate', 'replace'),
+      ('write_truncate_data', 'replace'),
+      ('write_empty', 'fail'),
+      (bigquery.WriteDisposition.WRITE_APPEND, 'append'),
+      (bigquery.WriteDisposition.WRITE_TRUNCATE, 'replace'),
+      (bigquery.WriteDisposition.WRITE_TRUNCATE_DATA, 'replace'),
+      (bigquery.WriteDisposition.WRITE_EMPTY, 'fail'),
+    ],
+  )
+  def test_init_creates_correct_write_disposition(self, disposition, expected):
+    writer = bigquery_writer.BigQueryWriter(
+      project='test', write_disposition=disposition
+    )
+    assert writer.write_disposition == expected

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -14,4 +14,4 @@` |
| `14` | `14` |  |
| `15` | `15` | `"""Writing GarfReport to anywhere."""` |
| `16` | `16` |  |
| `17` |  | `-__version__ = '0.0.13'` |
|  | `17` | `+__version__ = '0.0.14'` |