@@ -2,13 +2,15 @@
 
 import os
 import time
+from collections import deque
 from datetime import datetime, timezone
+from math import ceil
+
+import boto3
 import pytz
 
 from tap_cloudwatch.exception import InvalidQueryException
-from collections import deque
-import boto3
-from math import ceil
+
 
 class CloudwatchAPI:
     """Cloudwatch class for interacting with the API."""
@@ -66,7 +68,7 @@ def _create_client(self, config):
     def _request_more_records():
         return True
 
-    def split_batch_into_windows(self, start_time, end_time, batch_increment_s):
+    def _split_batch_into_windows(self, start_time, end_time, batch_increment_s):
        diff_s = end_time - start_time
        total_batches = ceil(diff_s / batch_increment_s)
        batch_windows = []
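
Note: the unchanged middle of this method is collapsed between these hunks. For context, here is a minimal standalone sketch of the windowing it performs; the loop body is an assumption reconstructed from the visible `ceil` and `append` lines, not the file's exact code:

    from math import ceil

    def split_batch_into_windows(start_time, end_time, batch_increment_s):
        # Split the [start_time, end_time) span (epoch seconds) into
        # fixed-size windows of batch_increment_s seconds each.
        diff_s = end_time - start_time
        total_batches = ceil(diff_s / batch_increment_s)
        batch_windows = []
        for batch_num in range(total_batches):
            # Each window starts where the previous one ended; the last
            # window is clamped so it does not overshoot end_time.
            query_start = int(start_time) + batch_increment_s * batch_num
            query_end = min(
                int(start_time) + batch_increment_s * (batch_num + 1),
                int(end_time),
            )
            batch_windows.append((query_start, query_end))
        return batch_windows

    # A 1-hour span in 25-minute windows -> three windows, the last one short:
    print(split_batch_into_windows(0, 3600, 1500))
    # [(0, 1500), (1500, 3000), (3000, 3600)]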
@@ -81,62 +83,86 @@ def split_batch_into_windows(self, start_time, end_time, batch_increment_s):
             batch_windows.append((query_start, query_end))
         return batch_windows
 
-    def validate_query(self, query):
+    def _validate_query(self, query):
         if "|sort" in query.replace(" ", ""):
             raise InvalidQueryException("sort not allowed")
         if "|limit" in query.replace(" ", ""):
             raise InvalidQueryException("limit not allowed")
         if "stats" in query:
             raise InvalidQueryException("stats not allowed")
         if "@timestamp" not in query.split("|")[0]:
-            raise InvalidQueryException("@timestamp field is used as the replication key so it must be selected")
+            raise InvalidQueryException(
+                "@timestamp field is used as the replication key so it must be selected"
+            )
 
     def get_records_iterator(self, bookmark, log_group, query, batch_increment_s):
         """Retrieve records from Cloudwatch."""
         end_time = datetime.now(timezone.utc).timestamp()
         start_time = bookmark.timestamp()
-        self.validate_query(query)
-        batch_windows = self.split_batch_into_windows(start_time, end_time, batch_increment_s)
+        self._validate_query(query)
+        batch_windows = self._split_batch_into_windows(
+            start_time, end_time, batch_increment_s
+        )
 
         queue = deque()
         for window in batch_windows:
             if len(queue) < (self.max_concurrent_queries - 1):
-                queue.append((self.start_query(window[0], window[1], log_group, query), window[0], window[1]))
+                queue.append(
+                    (
+                        self._start_query(window[0], window[1], log_group, query),
+                        window[0],
+                        window[1],
+                    )
+                )
             else:
                 query_id, start, end = queue.popleft()
-                queue.append((self.start_query(window[0], window[1], log_group, query), window[0], window[1]))
-                results = self.get_results(log_group, start, end, query, query_id)
+                queue.append(
+                    (
+                        self._start_query(window[0], window[1], log_group, query),
+                        window[0],
+                        window[1],
+                    )
+                )
+                results = self._get_results(log_group, start, end, query, query_id)
                 yield results
 
         while len(queue) > 0:
             query_id, start, end = queue.popleft()
-            results = self.get_results(log_group, start, end, query, query_id)
+            results = self._get_results(log_group, start, end, query, query_id)
             yield results
 
-    def handle_limit_exceeded(self, response, log_group, query_start, query_end, query):
+    def _handle_limit_exceeded(
+        self, response, log_group, query_start, query_end, query
+    ):
         results = response.get("results")
         last_record = results[-1]
 
-        latest_ts_str = [i["value"] for i in last_record if i["field"] == "@timestamp"][0]
+        latest_ts_str = [i["value"] for i in last_record if i["field"] == "@timestamp"][
+            0
+        ]
         # Include latest ts in query, this could cause duplicates but
         # without it we might miss ties
-        new_query_start = int(datetime.fromisoformat(latest_ts_str).replace(tzinfo=pytz.UTC).timestamp())
-        new_query_id = self.start_query(new_query_start, query_end, log_group, query)
-        return self.get_results(log_group, new_query_start, query_end, query, new_query_id)
+        new_query_start = int(
+            datetime.fromisoformat(latest_ts_str).replace(tzinfo=pytz.UTC).timestamp()
+        )
+        new_query_id = self._start_query(new_query_start, query_end, log_group, query)
+        return self._get_results(
+            log_group, new_query_start, query_end, query, new_query_id
+        )
 
-    def alter_query(self, query):
+    def _alter_query(self, query):
         query += " | sort @timestamp asc"
         return query
 
-    def start_query(self, query_start, query_end, log_group, query, prev_start=None):
+    def _start_query(self, query_start, query_end, log_group, query, prev_start=None):
         self.logger.info(
             (
                 "Submitting query for batch from:"
                 f" `{datetime.utcfromtimestamp(query_start).isoformat()} UTC` -"
                 f" `{datetime.utcfromtimestamp(query_end).isoformat()} UTC`"
             )
         )
-        query = self.alter_query(query)
+        query = self._alter_query(query)
         start_query_response = self.client.start_query(
             logGroupName=log_group,
             startTime=query_start,
@@ -146,7 +172,9 @@ def start_query(self, query_start, query_end, log_group, query, prev_start=None):
         )
         return start_query_response["queryId"]
 
-    def get_results(self, log_group, query_start, query_end, query, query_id, prev_start=None):
+    def _get_results(
+        self, log_group, query_start, query_end, query, query_id, prev_start=None
+    ):
         self.logger.info(
             (
                 "Retrieving results for batch from:"
@@ -161,15 +189,19 @@ def get_results(self, log_group, query_start, query_end, query, query_id, prev_start=None):
         if response.get("ResponseMetadata", {}).get("HTTPStatusCode") != 200:
             raise Exception(f"Failed: {response}")
         result_size = response.get("statistics", {}).get("recordsMatched")
-        results = response['results']
-        self.logger.info(
-            f"Result set size '{int(result_size)}' received."
-        )
+        results = response["results"]
+        self.logger.info(f"Result set size '{int(result_size)}' received.")
         if result_size > self.limit:
             if prev_start == query_start:
-                raise Exception("Stuck in a loop, smaller batch still exceeds limit. Reduce batch window.")
+                raise Exception(
+                    "Stuck in a loop, smaller batch still exceeds limit. "
+                    "Reduce batch window."
+                )
             self.logger.info(
-                f"Result set size '{int(result_size)}' exceeded limit '{self.limit}'. Re-running sub-batch..."
+                f"Result set size '{int(result_size)}' exceeded limit "
+                f"'{self.limit}'. Re-running sub-batch..."
+            )
+            results += self._handle_limit_exceeded(
+                response, log_group, query_start, query_end, query
             )
-            results += self.handle_limit_exceeded(response, log_group, query_start, query_end, query)
         return results
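
Note: the queue handling in `get_records_iterator` is a small pipelining pattern: keep up to `max_concurrent_queries - 1` Logs Insights queries in flight, and only wait on the oldest one's results when submitting the next window, so query submission and result retrieval overlap. A condensed sketch of the same pattern, where `submit` and `fetch` are hypothetical stand-ins for `_start_query` and `_get_results`:

    from collections import deque

    def pipelined(windows, submit, fetch, max_concurrent):
        queue = deque()  # (query_id, window) pairs currently in flight
        for window in windows:
            if len(queue) >= max_concurrent - 1:
                # At capacity: drain the oldest query before adding another.
                query_id, done_window = queue.popleft()
                yield fetch(query_id, done_window)
            queue.append((submit(window), window))
        # Drain whatever is still in flight once all windows are submitted.
        while queue:
            query_id, done_window = queue.popleft()
            yield fetch(query_id, done_window)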
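Note: `_handle_limit_exceeded` recovers from a batch whose matched records exceed `self.limit` by re-querying from the last returned row's `@timestamp`; as the inline comment says, re-including that timestamp can duplicate rows but avoids dropping ties. Logs Insights rows come back as lists of `{"field": ..., "value": ...}` dicts, so the timestamp has to be fished out by field name. A self-contained illustration with a made-up row:

    from datetime import datetime

    import pytz

    # A fabricated last row, shaped like a Logs Insights result record.
    last_record = [
        {"field": "@timestamp", "value": "2022-12-05 13:45:10.000"},
        {"field": "@message", "value": "..."},
    ]

    latest_ts_str = [i["value"] for i in last_record if i["field"] == "@timestamp"][0]
    # Interpret the naive timestamp as UTC and convert to epoch seconds,
    # mirroring the replace(tzinfo=pytz.UTC) call in the diff.
    new_query_start = int(
        datetime.fromisoformat(latest_ts_str).replace(tzinfo=pytz.UTC).timestamp()
    )
    print(new_query_start)  # 1670247910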