@@ -2,13 +2,15 @@

import os
import time
+from collections import deque
from datetime import datetime, timezone
+from math import ceil
+
+import boto3
import pytz

from tap_cloudwatch.exception import InvalidQueryException

-import boto3
-from math import ceil

class CloudwatchAPI:
    """Cloudwatch class for interacting with the API."""
@@ -17,6 +19,10 @@ def __init__(self, logger):
        """Initialize CloudwatchAPI."""
        self._client = None
        self.logger = logger
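+        # CloudWatch Logs Insights returns at most 10,000 rows per query and
+        # limits how many queries can run concurrently per account.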
+        self.limit = 10000
+        self.max_concurrent_queries = 20

    @property
    def client(self):
@@ -64,7 +68,7 @@ def _create_client(self, config):
    def _request_more_records():
        return True

-    def split_batch_into_windows(self, start_time, end_time, batch_increment_s):
+    def _split_batch_into_windows(self, start_time, end_time, batch_increment_s):
        diff_s = end_time - start_time
        total_batches = ceil(diff_s / batch_increment_s)
        batch_windows = []
@@ -79,71 +83,136 @@ def split_batch_into_windows(self, start_time, end_time, batch_increment_s):
            batch_windows.append((query_start, query_end))
        return batch_windows

-    def validate_query(self, query):
+    def _validate_query(self, query):
        if "|sort" in query.replace(" ", ""):
            raise InvalidQueryException("sort not allowed")
        if "|limit" in query.replace(" ", ""):
            raise InvalidQueryException("limit not allowed")
        if "stats" in query:
            raise InvalidQueryException("stats not allowed")
        if "@timestamp" not in query.split("|")[0]:
-            raise InvalidQueryException("@timestamp field is used as the replication key so it must be selected")
+            raise InvalidQueryException(
+                "@timestamp field is used as the replication key so it must be selected"
+            )

    def get_records_iterator(self, bookmark, log_group, query, batch_increment_s):
        """Retrieve records from Cloudwatch."""
        end_time = datetime.now(timezone.utc).timestamp()
        start_time = bookmark.timestamp()
-        self.validate_query(query)
-        batch_windows = self.split_batch_into_windows(start_time, end_time, batch_increment_s)
+        self._validate_query(query)
+        batch_windows = self._split_batch_into_windows(
+            start_time, end_time, batch_increment_s
+        )

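+        # Pipeline the windows: keep up to max_concurrent_queries Logs Insights
+        # queries in flight, yielding each window's results in submission order
+        # instead of running the batches strictly one at a time.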
+        queue = deque()
        for window in batch_windows:
-            yield self.handle_batch_window(window[0], window[1], log_group, query)
-
-    def handle_limit_exceeded(self, response, log_group, query_start, query_end, query):
+            if len(queue) < (self.max_concurrent_queries - 1):
+                # Below the concurrency cap: submit this window and move on.
+                queue.append(
+                    (
+                        self._start_query(window[0], window[1], log_group, query),
+                        window[0],
+                        window[1],
+                    )
+                )
+            else:
+                # At the cap: drain the oldest query before submitting the next.
+                query_id, start, end = queue.popleft()
+                queue.append(
+                    (
+                        self._start_query(window[0], window[1], log_group, query),
+                        window[0],
+                        window[1],
+                    )
+                )
+                results = self._get_results(log_group, start, end, query, query_id)
+                yield results
+
+        while len(queue) > 0:
+            query_id, start, end = queue.popleft()
+            results = self._get_results(log_group, start, end, query, query_id)
+            yield results
+
+
134
+ def _handle_limit_exceeded (
135
+ self , response , log_group , query_start , query_end , query
136
+ ):
103
137
results = response .get ("results" )
104
138
last_record = results [- 1 ]
105
139
106
- latest_ts_str = [i ["value" ] for i in last_record if i ["field" ] == "@timestamp" ][0 ]
140
+ latest_ts_str = [i ["value" ] for i in last_record if i ["field" ] == "@timestamp" ][
141
+ 0
142
+ ]
        # Include latest ts in query, this could cause duplicates but
        # without it we might miss ties
-        query_start = int(datetime.fromisoformat(latest_ts_str).replace(tzinfo=pytz.UTC).timestamp())
-        self.handle_batch_window(query_start, query_end, log_group, query, prev_start=query_start)
+        new_query_start = int(
+            datetime.fromisoformat(latest_ts_str).replace(tzinfo=pytz.UTC).timestamp()
+        )
+        new_query_id = self._start_query(new_query_start, query_end, log_group, query)
+        # Pass prev_start so the sub-batch can detect when it makes no progress.
+        return self._get_results(
+            log_group, new_query_start, query_end, query, new_query_id,
+            prev_start=new_query_start,
+        )

-    def alter_query(self, query):
+    def _alter_query(self, query):
        query += " | sort @timestamp asc"
        return query

-    def handle_batch_window(self, query_start, query_end, log_group, query, prev_start=None):
+    def _start_query(self, query_start, query_end, log_group, query):
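+        """Submit an asynchronous Logs Insights query and return its query id."""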
        self.logger.info(
            (
-                "Retrieving batch from:"
+                "Submitting query for batch from:"
                f" `{datetime.utcfromtimestamp(query_start).isoformat()} UTC` -"
                f" `{datetime.utcfromtimestamp(query_end).isoformat()} UTC`"
            )
        )
-        limit = 10000
-        query = self.alter_query(query)
+        query = self._alter_query(query)
        start_query_response = self.client.start_query(
            logGroupName=log_group,
            startTime=query_start,
            endTime=query_end,
            queryString=query,
-            limit=limit,
+            limit=self.limit,
        )
+        return start_query_response["queryId"]

-        query_id = start_query_response["queryId"]
-        response = None
+    def _get_results(
+        self, log_group, query_start, query_end, query, query_id, prev_start=None
+    ):
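+        """Poll for a query's results, re-running the sub-batch if truncated."""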
+        self.logger.info(
+            (
+                "Retrieving results for batch from:"
+                f" `{datetime.utcfromtimestamp(query_start).isoformat()} UTC` -"
+                f" `{datetime.utcfromtimestamp(query_end).isoformat()} UTC`"
+            )
+        )
+        response = self.client.get_query_results(queryId=query_id)
-        while response is None or response["status"] == "Running":
-            time.sleep(1)
+        # "Scheduled" precedes "Running"; treat both as still in progress.
+        while response is None or response["status"] in ("Scheduled", "Running"):
+            time.sleep(0.5)
            response = self.client.get_query_results(queryId=query_id)
        if response.get("ResponseMetadata", {}).get("HTTPStatusCode") != 200:
            raise Exception(f"Failed: {response}")
        result_size = response.get("statistics", {}).get("recordsMatched")
-        if result_size > limit:
+        results = response["results"]
+        self.logger.info(f"Result set size '{int(result_size)}' received.")
+        if result_size > self.limit:
            if prev_start == query_start:
-                raise Exception("Stuck in a loop, smaller batch still exceeds limit. Reduce batch window.")
+                raise Exception(
+                    "Stuck in a loop, smaller batch still exceeds limit. "
+                    "Reduce batch window."
+                )
            self.logger.info(
-                f"Result set size '{int(result_size)}' exceeded limit '{limit}'. Re-running sub-batch..."
+                f"Result set size '{int(result_size)}' exceeded limit "
+                f"'{self.limit}'. Re-running sub-batch..."
+            )
+            results += self._handle_limit_exceeded(
+                response, log_group, query_start, query_end, query
            )
-        self.handle_limit_exceeded(response, log_group, query_start, query_end, query)
-        return response
+        return results