42 changes: 42 additions & 0 deletions bin/results.txt
@@ -0,0 +1,42 @@
- Results details:

# A) events index write-only
Repo https://github.com/elastic/elasticsearch processed: time 00:04:43, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:44, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:28, events 85190

# B) events index write-only, lookups write-and-update
Repo https://github.com/elastic/elasticsearch processed: time 00:05:51, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:05:27, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:05:23, events 85190

# C) events index write-and-update (no items mod)
Repo https://github.com/elastic/elasticsearch processed: time 00:04:32, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:30, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:29, events 85190

# D) events index write-and-update (items mod)
Repo https://github.com/elastic/elasticsearch processed: time 00:05:06, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:53, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:46, events 85190


- Discussions:
Four tests were conducted to evaluate approaches for saving data to ElasticSearch; they are described below.

Approach A consists of writing Perceval items to an index, leaving ElasticSearch the responsibility
of assigning unique identifiers; thus, Perceval items with the same `uuid` are indexed more than once.

Approach B extends approach A by keeping the metadata in a separate index in order
to track the latest time information (i.e., `metadata_timestamp`, `metadata_updated_on`) of a given Perceval item.
As the results show, this extra step decreases performance by around 20%.

Approach C consists of writing Perceval items to an index, but the unique identifiers are set using
the Perceval `uuid` values. Thus, the operations performed on the index involve both writes and updates, since
the same item may be retrieved several times. In this case the performance is better than in approach A: since
the documents used to test this approach are not modified, it is possible that Lucene performs some
optimization and does not reindex them.

Approach D is similar to approach C, but the documents are modified. As can be seen, this approach
performs worse than approach A (by around 6%).
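
To make the difference between the modes concrete, here is a minimal sketch of how write-only and write-and-update indexing translate to bulk actions with the `elasticsearch` Python client. The client setup, index names, and helper functions are illustrative, not the code used for these tests:

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('https://admin:admin@localhost:9200')


def write_only(index, items):
    # Approach A: no _id is set, so ElasticSearch generates one per document
    # and re-fetched items with the same `uuid` are indexed more than once
    actions = ({'_index': index, '_source': item} for item in items)
    return bulk(es, actions)


def write_and_update(index, items):
    # Approaches C/D: the _id comes from the Perceval `uuid`, so re-fetched
    # items overwrite (update) the existing document instead of duplicating it
    actions = ({'_index': index, '_id': item['uuid'], '_source': item}
               for item in items)
    return bulk(es, actions)
```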

114 changes: 114 additions & 0 deletions bin/test.py
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Valerio Cosentino <[email protected]>


import itertools
import time

from perceval.backends.core.git import Git
from citadel.collections.events import (BY_HOUR,
                                        Events)
from citadel.collections.lookups import Lookups


REPOS = [
    # ("https://github.com/chaoss/grimoirelab-perceval", "/tmp/a"),
    # ("https://github.com/chaoss/grimoirelab-toolkit", "/tmp/b"),
    # ("https://github.com/chaoss/grimoirelab-elk", "/tmp/c"),
    # ("https://github.com/chaoss/grimoirelab-sirmordred", "/tmp/d"),
    # ("https://github.com/chaoss/grimoirelab-graal", "/tmp/e"),
    ("https://github.com/elastic/elasticsearch", "/tmp/f")
]

URL = 'https://admin:admin@localhost:9200'
LOOPS = 4


def test_git():

    # clone the repos so later fetches work on local copies
    for url, local_path in REPOS:
        msg = "Cloning repo {} at {}".format(url, local_path)
        print(msg)

        git = Git(url, local_path)
        commits = [i for i in git.fetch()]

        print(len(commits))

    print("Start test with writes")
    # writes: no field_id, so ElasticSearch assigns the document ids
    events = Events(URL, base_index='events_writes', timeframe=BY_HOUR)
    lookups = Lookups(URL)
    c = 0
    while True:

        for url, local_path in REPOS:

            time_start = time.time()

            git = Git(url, local_path)
            # tee the generator so events and lookups consume the same fetch
            items, items2 = itertools.tee(git.fetch())

            events_ = events.store(items)
            lookups_ = lookups.store(items2)

            spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
            msg = "Repo {} processed: time {}, events {}".format(url, spent_time, events_)
            print(msg)

        c = c + 1

        if c == LOOPS:
            break

    print("Start test with updates")
    # updates: document ids are taken from the Perceval `uuid`
    events = Events(URL, base_index='events_updates', timeframe=BY_HOUR)
    c = 0
    while True:

        for url, local_path in REPOS:

            time_start = time.time()

            git = Git(url, local_path)
            items = git.fetch()

            events_ = events.store(items, field_id="uuid")

            spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
            msg = "Repo {} processed: time {}, events {}".format(url, spent_time, events_)
            print(msg)

        c = c + 1

        if c == LOOPS:
            break


def main():
    test_git()


if __name__ == '__main__':
    main()
Empty file added citadel/collections/__init__.py
Empty file.
127 changes: 127 additions & 0 deletions citadel/collections/events.py
@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <[email protected]>
#


from grimoirelab_toolkit.datetime import datetime_utcnow


from citadel.errors import EventsError
from citadel.storage_engines.elasticsearch import ElasticsearchStorage


EVENTS = 'events'

BY_MINUTE = 'minute'
BY_HOUR = 'hour'
BY_DAY = 'day'
BY_MONTH = 'month'

TIMEFRAMES = [BY_MINUTE, BY_HOUR, BY_DAY, BY_MONTH]


class Events:
Member:

Why is this called Events?

Contributor Author:

We discussed this offline, before working on the PR.

It is called Events because we eventize the Perceval items. You can find more details in the description of approach A:

> Approach A consists of writing Perceval items to an index, leaving ElasticSearch the responsibility
> of assigning unique identifiers; thus, Perceval items with the same `uuid` are indexed more than once.

Member:

I know, but the name doesn't make sense to me. Is this a list of events? It also shouldn't be in the plural.


    PERCEVAL_MAPPING = """
    {
      "mappings": {
        "%s": {
          "dynamic": false,
          "properties": {
            "backend_name": {
              "type": "keyword"
            },
            "backend_version": {
              "type": "keyword"
            },
            "category": {
              "type": "keyword"
            },
            "classified_fields_filtered": {
              "type": "keyword"
            },
            "data": {
              "properties": {}
            },
            "origin": {
              "type": "keyword"
            },
            "perceval_version": {
              "type": "keyword"
            },
            "tag": {
              "type": "keyword"
            },
            "timestamp": {
              "type": "long"
            },
            "updated_on": {
              "type": "long"
            },
            "uuid": {
              "type": "keyword"
            }
          }
        }
      }
    }
    """ % ElasticsearchStorage.ITEMS

    def __init__(self, url, base_index=EVENTS, timeframe=BY_DAY):
        self.storage = ElasticsearchStorage(url)

        if timeframe not in TIMEFRAMES:
            msg = "Unknown timeframe {}".format(timeframe)
            raise EventsError(cause=msg)

        self.timeframe = timeframe
        self.base_index = base_index

    def index_name(self):
Member:

This is still ElasticSearch specific. At this level I think we should avoid this. I think it should be an abstract class, and each specific class will implement the methods needed to write and route where the items will be written.

If that creates a lot of levels of abstraction, all of this can be moved to the StorageEngine class. That would work too, but we have to take into account that it should be independent and without public references to ES.

Contributor Author:

> ...I think it should be an abstract class, and each specific class will implement the methods needed to write and route where the items will be written.

You have just described the StorageEngine class.

> If that creates a lot of levels of abstraction, all of this can be moved to the StorageEngine class. That would work too, but we have to take into account that it should be independent and without public references to ES.

  • Why can't we build on top of the elasticsearch storage engine (which already hides some specificities of dealing with ES)?

  • Who is going to set the index name, or do you plan to create it in a fully automatic way?

Member:

> You have just described the StorageEngine class.

Not really. As StorageEngine is now, you have to be explicit about where you want to store your items. I, as a developer/user, don't want to decide this. The library should decide how to do this and what the best way to do it is. I just want to store data and retrieve data.

> Why can't we build on top of the elasticsearch storage engine (which already hides some specificities of dealing with ES)?

We can and we should. My point is I don't want to expose the concept of index, name selection and so on. The system should do that for me.

> Who is going to set the index name, or do you plan to create it in a fully automatic way?

The library should do that. The developer/user can define a prefix if we want, but nothing else.

The idea behind all of this is to have different levels of abstraction. The lower level should do what StorageEngine is doing now. An upper level should take the items and decide where and how to store them. This level can be integrated in the StorageEngine, but I see two different levels of abstraction. Maybe the current StorageEngine should be something private, so each developer who wants a new system would create their own.
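
A rough sketch of the two-level split described in this thread; every name here (`ItemStore`, `PrefixedStore`, the engine call) is hypothetical and only illustrates the proposal, it is not part of this PR:

```python
from abc import ABC, abstractmethod
from datetime import datetime, timezone


class ItemStore(ABC):
    """Upper level: callers just store items; routing stays internal."""

    @abstractmethod
    def store(self, items):
        """Store items, deciding where and how internally."""


class PrefixedStore(ItemStore):
    """Concrete store: the user supplies a prefix and nothing else."""

    def __init__(self, engine, prefix='items'):
        self._engine = engine  # lower level (storage engine), kept private
        self._prefix = prefix  # the only knob exposed to the user

    def store(self, items):
        # The library, not the caller, picks the destination index
        index = self._pick_index()
        # simplified engine call; a real implementation would batch writes
        return self._engine.write(index, items, item_type='items')

    def _pick_index(self):
        suffix = datetime.now(timezone.utc).strftime("%Y%m%d")
        return "{}_{}".format(self._prefix, suffix)
```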


        def __set_timeframe_format():
Member:

This means we are storing data depending on when it was retrieved. Another possibility is to store data depending on when it was updated/created. That would spread the items among indexes.

Contributor Author:

We discussed this offline, before working on the PR.

Do you want to change the way of storing items? Let me know and I'll change the code as you prefer.

Member:

I'm just wondering which solution will be the best. I like the idea you proposed of storing data as if they were events. Taking that into account, I'd like to understand which solution is better.

Your current solution is to store everything with the date we're getting it. That means we can have a huge index when we start analyzing a project. Later, indexes will be smaller. The good thing is it's fast to store data, and we can quickly access the information we stored at a certain point in time.

The other solution is to store data by when it was updated in the origin. The good thing is it makes searching faster within a date range. You can configure shards to give more resources and make fast searches on all those indexes within a range (for example, the last two years). It also follows better the approach of storing events as they are gathered. The big problem is we need something to route items to their right index (we can have indexes per year and month, like in gharchive), which makes the system slower when writing data.

Either solution would be fine. I just want to think through the pros and cons before implementing it.
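
As a sketch of that second option, routing could derive the index from each item's `updated_on` value (gharchive-style year/month indexes). This assumes `updated_on` holds a Unix timestamp, as in the mapping above; the function name is hypothetical:

```python
from datetime import datetime, timezone


def route_index(base_index, item):
    # Route by when the item was updated at the origin, not by when it
    # was retrieved (assumes `updated_on` is epoch seconds)
    updated = datetime.fromtimestamp(item['updated_on'], tz=timezone.utc)
    return "{}_{}".format(base_index, updated.strftime("%Y%m"))


# route_index('events', {'updated_on': 1546300800}) -> 'events_201901'
```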

            frmt = "%Y%m"
            if self.timeframe == BY_MINUTE:
                frmt = "%Y%m%d_%Hh%Mm"
            elif self.timeframe == BY_HOUR:
                frmt = "%Y%m%d_%Hh"
            elif self.timeframe == BY_DAY:
                frmt = "%Y%m%d"

            return frmt

        timeframe_format = __set_timeframe_format()
        timeframe = datetime_utcnow().replace(tzinfo=None).strftime(timeframe_format)

        index_name = self.base_index + "_" + timeframe

        return index_name

    def store(self, data, field_id=None):

        index = self.index_name()

        if not self.storage.elasticsearch.indices.exists(index):
            self.storage.create_index(index, self.PERCEVAL_MAPPING)
            self.storage.set_alias(self.base_index, index)

        written = self.storage.write(index, data, field_id=field_id)

        return written
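
A minimal usage sketch for this class, assuming a local ElasticSearch reachable at the URL below and the Perceval Git backend:

```python
from perceval.backends.core.git import Git
from citadel.collections.events import BY_DAY, Events

events = Events('https://admin:admin@localhost:9200', base_index='events', timeframe=BY_DAY)
items = Git('https://github.com/chaoss/grimoirelab-toolkit', '/tmp/toolkit').fetch()

# Write-only: ElasticSearch assigns the document ids (approach A)
written = events.store(items)

# Write-and-update: ids come from the Perceval `uuid` (approaches C and D)
# written = events.store(items, field_id='uuid')
print(written)
```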
87 changes: 87 additions & 0 deletions citadel/collections/lookups.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <[email protected]>
#

from citadel.storage_engines.elasticsearch import ElasticsearchStorage


class Lookups:
Member:

What's the purpose of this class?

Contributor Author:

We discussed this offline, before working on the PR.

The lookups index keeps the last value of the Perceval items inserted in the events index. It is used in approach B:

> Approach B extends approach A by keeping the metadata in a separate index in order
> to track the latest time information (i.e., `metadata_timestamp`, `metadata_updated_on`) of a given Perceval item.
> As the results show, this extra step decreases performance by around 20%.

Member:

I thought we weren't going to implement this in this step. Anyway, why does this need a class, and why not integrate it with the StorageEngine?

Contributor Author:

I put Events and Lookups outside since they were more of a POC to evaluate different approaches.


    UUID_SEARCH = 'uuid'
    LOOKUPS = 'lookups'
    LOOKUPS_MAPPING = """
    {
      "mappings": {
        "%s": {
          "dynamic": false,
          "_source": {
            "excludes": [
              "data"
            ]
          },
          "properties": {
            "backend_name": {
              "type": "keyword"
            },
            "backend_version": {
              "type": "keyword"
            },
            "category": {
              "type": "keyword"
            },
            "classified_fields_filtered": {
              "type": "keyword"
            },
            "origin": {
              "type": "keyword"
            },
            "perceval_version": {
              "type": "keyword"
            },
            "tag": {
              "type": "keyword"
            },
            "timestamp": {
              "type": "long"
            },
            "updated_on": {
              "type": "long"
            },
            "uuid": {
              "type": "keyword"
            }
          }
        }
      }
    }
    """ % LOOKUPS

    def __init__(self, url, index=LOOKUPS):
        self.storage = ElasticsearchStorage(url)
        self.index = index

    def store(self, data):

        if not self.storage.elasticsearch.indices.exists(self.index):
            self.storage.create_index(self.index, self.LOOKUPS_MAPPING)

        written = self.storage.write(self.index, data, item_type=self.LOOKUPS, field_id=self.UUID_SEARCH)

        return written
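
A short usage sketch, again assuming a local instance; since documents are keyed by `uuid`, repeated runs upsert the lookup entries rather than duplicating them:

```python
from perceval.backends.core.git import Git
from citadel.collections.lookups import Lookups

lookups = Lookups('https://admin:admin@localhost:9200')
items = Git('https://github.com/chaoss/grimoirelab-toolkit', '/tmp/toolkit').fetch()
written = lookups.store(items)
print(written)
```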
6 changes: 6 additions & 0 deletions citadel/errors.py
@@ -40,3 +40,9 @@ class StorageEngineError(BaseError):
"""Generic error for storage engines"""

message = "%(cause)s"


class EventsError(BaseError):
"""Generic error for events"""

message = "%(cause)s"
5 changes: 4 additions & 1 deletion citadel/storage_engine.py
@@ -25,11 +25,14 @@ class StorageEngine:

    This class allows performing write operations on storage engines.
    """
    def write(self, resource, data, chunk_size=None):
    def write(self, resource, data, item_type, chunk_size=None, field_id=None):
        """Write method to be redefined by subclasses

        :param resource: the location where the data is written
        :param data: the data to be written
        :param item_type: type of the item
        :param chunk_size: size of data chunks
        :param field_id: field representing the ID of the item. If None the
            ID generation is delegated to the storage engine
        """
        raise NotImplementedError
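
A sketch of how a subclass might honor this contract: an in-memory engine, purely illustrative and not part of the PR:

```python
import itertools

from citadel.storage_engine import StorageEngine


class InMemoryStorage(StorageEngine):
    """Toy engine: keeps items in per-resource dicts, for illustration only."""

    def __init__(self):
        self.resources = {}
        self._ids = itertools.count()

    def write(self, resource, data, item_type, chunk_size=None, field_id=None):
        bucket = self.resources.setdefault(resource, {})
        written = 0
        for item in data:
            # Delegate id generation to the engine when no field_id is given
            key = item[field_id] if field_id else next(self._ids)
            bucket[key] = item  # same key -> update, new key -> write
            written += 1
        return written
```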