Skip to content

Commit 0570ddf

Browse files
committed
source-google-play: complete initial connector development
This commit finishes the initial development for `source-google-play`. Some notable decisions made include: - Title casing all field names. The CSV column headers are not consistently named across files. Although I had hoped to avoid transformations as much as possible, ensuring fields are consistently named makes downstream processing easier for users plus it allows us to reuse more code in the connector (ex: primary keys are the same, model field definitions are simpler, etc.). - The `_overview` suffix is used for statistics files that aren't split on dimensions, while there is no suffix for reviews that aren't split on dimensions. There _are_ other files in the bucket containing data split by certain dimensions, and it's very easy to add another binding to capture these by overriding the `suffix` class variable for a given resource. Those additional bindings aren't needed right now, but they'll be easy to add in the future if someone asks for them later. - Reviews have an "updated_at" type of field that appears to always be present. This means that instead of yielding every row of an updated file, we can instead only yield rows that have been updated since the previous sweep. - The "Row Number" field doesn't need to be part of any `Statistics` primary key since the "Date" and "Package Name" uniquely identify a row already. No such combination of unique identifiers exist for "Reviews", so we still add "Row Number" into those documents.
1 parent 69861eb commit 0570ddf

File tree

4 files changed

+102
-84
lines changed

4 files changed

+102
-84
lines changed

source-google-play/source_google_play/api.py

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from .models import (
88
GooglePlayRow,
9+
Reviews,
910
)
1011

1112
from .gcs import GCSClient, GCSFileMetadata
@@ -20,14 +21,6 @@ async def fetch_resources(
2021
) -> AsyncGenerator[GooglePlayRow | LogCursor, None]:
2122
assert isinstance(log_cursor, datetime)
2223

23-
# The code below this return is a best-effort implementation based on the Google Play
24-
# documentation about how the GCS bucket organizes data. It boils down to:
25-
# - Find all files updated on or after the log_cursor.
26-
# - Yield all rows from those files.
27-
#
28-
# Once we have valid credentials, development can continue and we can iterate on the code below.
29-
return
30-
3124
files: list[GCSFileMetadata] = []
3225
async for file in gcs_client.list_files(prefix=model.prefix, globPattern=model.get_glob_pattern()):
3326
if file.updated >= log_cursor:
@@ -39,7 +32,15 @@ async def fetch_resources(
3932
model,
4033
model.validation_context_model(filename=file.name),
4134
):
42-
yield row
35+
# Reviews have a "Review Last Update Date And Time" field that we can use to
36+
# only yield rows that have been updated since the last sweep.
37+
if isinstance(row, Reviews):
38+
if row.updated_at >= log_cursor:
39+
yield row
40+
# All other resources do not have an "updated_at" type field, so we have to
41+
# yield all rows for every file that's been updated since the last sweep.
42+
else:
43+
yield row
4344

4445
if len(files) > 0:
4546
latest_file = max(files, key=lambda f: f.updated)
@@ -60,15 +61,6 @@ async def backfill_resources(
6061
if cursor_month >= cutoff:
6162
return
6263

63-
# The code below this return is a best-effort implementation based on the Google Play
64-
# documentation about how the GCS bucket organizes data. It boils down to:
65-
# - Find all files containing data for the same month as the page cursor.
66-
# - Yield all rows from those files.
67-
# - Stop when the page cursor reaches the cutoff.
68-
#
69-
# Once we have valid credentials, development can continue and we can iterate on the code below.
70-
return
71-
7264
files: list[GCSFileMetadata] = []
7365
async for file in gcs_client.list_files(prefix=model.prefix, globPattern=model.get_glob_pattern(cursor_month)):
7466
files.append(file)

source-google-play/source_google_play/gcs.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@
66
from pydantic import BaseModel, Field
77

88
from estuary_cdk.http import HTTPSession
9-
from estuary_cdk.incremental_csv_processor import IncrementalCSVProcessor
9+
from estuary_cdk.incremental_csv_processor import CSVConfig, IncrementalCSVProcessor
1010

1111

12+
CSV_CONFIG = CSVConfig(
13+
encoding="utf-16",
14+
)
15+
1216
_CSVRow = TypeVar('_CSVRow', bound=BaseModel)
1317

1418

@@ -128,6 +132,7 @@ async def stream_csv(
128132
processor = IncrementalCSVProcessor(
129133
body(),
130134
model,
135+
config=CSV_CONFIG,
131136
validation_context=validation_context
132137
)
133138

source-google-play/source_google_play/models.py

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,24 @@
33
import re
44

55
from estuary_cdk.capture.common import (
6-
BaseDocument,
76
ConnectorState as GenericConnectorState,
8-
LogCursor,
9-
Logger,
107
ResourceConfig,
118
ResourceState,
129
)
1310
from estuary_cdk.flow import (
1411
GoogleServiceAccount,
1512
GoogleServiceAccountSpec,
1613
)
17-
14+
from estuary_cdk.incremental_csv_processor import BaseCSVRow
1815

1916
from pydantic import AwareDatetime, BaseModel, Field, model_validator, ValidationInfo
2017

2118

19+
PACKAGE_NAME_FIELD = "Package Name"
20+
ROW_NUMBER_FIELD = "Row Number"
21+
MONTH_FIELD = "Month"
22+
YEAR_FIELD = "Year"
23+
2224
EPOCH = datetime(1970, 1, 1, tzinfo=UTC)
2325

2426
GOOGLE_SPEC = GoogleServiceAccountSpec(
@@ -60,9 +62,10 @@ def increment(self):
6062
self.count += 1
6163

6264

63-
class GooglePlayRow(BaseDocument, extra="allow"):
65+
class GooglePlayRow(BaseCSVRow, extra="allow"):
6466
name: ClassVar[str]
6567
prefix: ClassVar[str]
68+
suffix: ClassVar[str | None] = None
6669
primary_keys: ClassVar[list[str]]
6770
validation_context_model: ClassVar[type[BaseValidationContext]] = BaseValidationContext
6871

@@ -74,27 +77,36 @@ def get_glob_pattern(cls, date: datetime | None = None) -> str:
7477
if date:
7578
year_month_pattern = f"{date.year:04d}{date.month:02d}"
7679

77-
return f"**_{year_month_pattern}.csv"
80+
pattern = f"**_{year_month_pattern}"
81+
82+
if cls.suffix:
83+
pattern += f"_{cls.suffix}"
7884

79-
package_name: str
80-
row_number: int
85+
pattern += ".csv"
8186

87+
return pattern
88+
89+
package_name: str = Field(alias=PACKAGE_NAME_FIELD)
90+
91+
# The column naming convention across CSVs is not inherently consistent. ex: Sometimes a column
92+
# is named "Package name" and other times it's "Package Name". We normalize the column names to
93+
# be title case, which is the predominant casing convention for these columns, before we
94+
# perform any transformations.
8295
@model_validator(mode="before")
8396
@classmethod
84-
def _add_row_number(cls, data: dict[str, Any], info: ValidationInfo) -> dict[str, Any]:
97+
def _normalize_field_names(cls, data: dict[str, Any], info: ValidationInfo) -> dict[str, Any]:
98+
normalized_data: dict[str, Any] = {}
99+
for key, value in data.items():
100+
normalized_key = key.title()
101+
normalized_data[normalized_key] = value
85102

86-
if not info.context or not isinstance(info.context, BaseValidationContext):
87-
raise RuntimeError(f"Validation context is not set or is not of type BaseValidationContext: {info.context}")
88-
89-
assert "row_number" not in data, "Row number should not be set before validation."
90-
data["row_number"] = info.context.count
91-
info.context.increment()
92-
return data
103+
return normalized_data
93104

94105

95106
class Statistics(GooglePlayRow):
96-
primary_keys: ClassVar[list[str]] = ["/date", "/package_name", "/row_number"]
97-
date: str
107+
suffix: ClassVar[str | None] = "overview"
108+
primary_keys: ClassVar[list[str]] = ["/Date", f"/{PACKAGE_NAME_FIELD}"]
109+
date: str = Field(alias="Date")
98110

99111

100112
class Crashes(Statistics):
@@ -110,9 +122,9 @@ class Installs(Statistics):
110122
class ReviewValidationContext(BaseValidationContext):
111123
def __init__(self, filename: str):
112124
super().__init__()
113-
self.year_month = self._extract_year_month(filename)
125+
self.year, self.month = self._extract_year_month(filename)
114126

115-
def _extract_year_month(self, filename: str) -> str:
127+
def _extract_year_month(self, filename: str) -> tuple[str, str]:
116128
"""
117129
Extract YYYYMM from review filenames in various formats:
118130
- /reviews/reviews_[package_name]_YYYYMM.csv
@@ -123,38 +135,47 @@ def _extract_year_month(self, filename: str) -> str:
123135
filename: The filename or path
124136
125137
Returns:
126-
The YYYYMM string.
138+
A tuple containing the year and month as strings.
127139
"""
128140
# Matches reviews_[anything]_YYYYMM[_optionalstuff].csv
129141
pattern = r'reviews_[^_]+_(\d{6})(?:_[^.]*)?\.csv$'
130142
match = re.search(pattern, filename)
131143

132144
assert match, f"Filename does not match expected pattern: {filename}"
133145

134-
return match.group(1)
146+
year_month = match.group(1) # YYYYMM
147+
year = year_month[:4]
148+
month = year_month[4:6]
149+
150+
return (year, month)
135151

136152

137-
# There _might_ be a "Review Last Update Date and Time" we could use to incrementally
138-
# capture updates within a specific file of Reviews. However, the documentation says it's optional
139-
# and we haven't see what this data actually looks like. Once we see real data, we can
140-
# evaluate whether or not the incremental replication strategy for Reviews can be improved.
141153
class Reviews(GooglePlayRow):
142154
name: ClassVar[str] = "reviews"
143155
prefix: ClassVar[str] = "reviews"
144-
primary_keys: ClassVar[list[str]] = ["/year_month", "/package_name", "/row_number"]
156+
primary_keys: ClassVar[list[str]] = [f"/{YEAR_FIELD}", f"/{MONTH_FIELD}", f"/{PACKAGE_NAME_FIELD}", f"/{ROW_NUMBER_FIELD}"]
145157
validation_context_model: ClassVar[type[BaseValidationContext]] = ReviewValidationContext
146158

147-
year_month: str
159+
row_number: int = Field(alias=ROW_NUMBER_FIELD)
160+
year: str = Field(alias=YEAR_FIELD)
161+
month: str = Field(alias=MONTH_FIELD)
162+
updated_at: AwareDatetime = Field(alias="Review Last Update Date And Time")
148163

149164
@model_validator(mode="before")
150165
@classmethod
151-
def _add_year_month(cls, data: dict[str, Any], info: ValidationInfo) -> dict[str, Any]:
166+
def _add_primary_key_components(cls, data: dict[str, Any], info: ValidationInfo) -> dict[str, Any]:
152167

153168
if not info.context or not isinstance(info.context, ReviewValidationContext):
154169
raise RuntimeError(f"Validation context is not set or is not of type ReviewValidationContext: {info.context}")
155170

156-
assert "year_month" not in data, "year_month should not be set before validation."
157-
data["year_month"] = info.context.year_month
171+
assert YEAR_FIELD not in data, f"{YEAR_FIELD} should not be set before validation."
172+
assert MONTH_FIELD not in data, f"{MONTH_FIELD} should not be set before validation."
173+
data[YEAR_FIELD] = info.context.year
174+
data[MONTH_FIELD] = info.context.month
175+
176+
assert ROW_NUMBER_FIELD not in data, f"{ROW_NUMBER_FIELD} should not be set before validation."
177+
data[ROW_NUMBER_FIELD] = info.context.count
178+
info.context.increment()
158179
return data
159180

160181

source-google-play/tests/snapshots/snapshots__discover__capture.stdout.json

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -41,32 +41,26 @@
4141
},
4242
"description": "Document metadata"
4343
},
44-
"package_name": {
44+
"Package Name": {
4545
"title": "Package Name",
4646
"type": "string"
4747
},
48-
"row_number": {
49-
"title": "Row Number",
50-
"type": "integer"
51-
},
52-
"date": {
48+
"Date": {
5349
"title": "Date",
5450
"type": "string"
5551
}
5652
},
5753
"required": [
58-
"package_name",
59-
"row_number",
60-
"date"
54+
"Package Name",
55+
"Date"
6156
],
6257
"title": "Crashes",
6358
"type": "object",
6459
"x-infer-schema": true
6560
},
6661
"key": [
67-
"/date",
68-
"/package_name",
69-
"/row_number"
62+
"/Date",
63+
"/Package Name"
7064
]
7165
},
7266
{
@@ -111,32 +105,26 @@
111105
},
112106
"description": "Document metadata"
113107
},
114-
"package_name": {
108+
"Package Name": {
115109
"title": "Package Name",
116110
"type": "string"
117111
},
118-
"row_number": {
119-
"title": "Row Number",
120-
"type": "integer"
121-
},
122-
"date": {
112+
"Date": {
123113
"title": "Date",
124114
"type": "string"
125115
}
126116
},
127117
"required": [
128-
"package_name",
129-
"row_number",
130-
"date"
118+
"Package Name",
119+
"Date"
131120
],
132121
"title": "Installs",
133122
"type": "object",
134123
"x-infer-schema": true
135124
},
136125
"key": [
137-
"/date",
138-
"/package_name",
139-
"/row_number"
126+
"/Date",
127+
"/Package Name"
140128
]
141129
},
142130
{
@@ -181,32 +169,44 @@
181169
},
182170
"description": "Document metadata"
183171
},
184-
"package_name": {
172+
"Package Name": {
185173
"title": "Package Name",
186174
"type": "string"
187175
},
188-
"row_number": {
176+
"Row Number": {
189177
"title": "Row Number",
190178
"type": "integer"
191179
},
192-
"year_month": {
193-
"title": "Year Month",
180+
"Year": {
181+
"title": "Year",
182+
"type": "string"
183+
},
184+
"Month": {
185+
"title": "Month",
186+
"type": "string"
187+
},
188+
"Review Last Update Date And Time": {
189+
"format": "date-time",
190+
"title": "Review Last Update Date And Time",
194191
"type": "string"
195192
}
196193
},
197194
"required": [
198-
"package_name",
199-
"row_number",
200-
"year_month"
195+
"Package Name",
196+
"Row Number",
197+
"Year",
198+
"Month",
199+
"Review Last Update Date And Time"
201200
],
202201
"title": "Reviews",
203202
"type": "object",
204203
"x-infer-schema": true
205204
},
206205
"key": [
207-
"/package_name",
208-
"/row_number",
209-
"/year_month"
206+
"/Month",
207+
"/Package Name",
208+
"/Row Number",
209+
"/Year"
210210
]
211211
}
212212
]

0 commit comments

Comments
 (0)