42 changes: 42 additions & 0 deletions bin/results.txt
@@ -0,0 +1,42 @@
- Results details:

# A) events index write-only
Repo https://github.com/elastic/elasticsearch processed: time 00:04:43, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:44, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:28, events 85190

# B) events index write-only, lookups write-and-update
Repo https://github.com/elastic/elasticsearch processed: time 00:05:51, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:05:27, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:05:23, events 85190

# C) events index write-and-update (no items mod)
Repo https://github.com/elastic/elasticsearch processed: time 00:04:32, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:30, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:29, events 85190

# D) events index write-and-update (items mod)
Repo https://github.com/elastic/elasticsearch processed: time 00:05:06, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:53, events 85190
Repo https://github.com/elastic/elasticsearch processed: time 00:04:46, events 85190


- Discussions:
Four tests were conducted to evaluate approaches for saving data to ElasticSearch; they are described below.

Approach A consists of writing Perceval items to an index, leaving ElasticSearch the responsibility
of assigning unique identifiers; thus, Perceval items with the same `uuid` are indexed more than once.

Approach B extends approach A by keeping the metadata in a separate index in order
to track the latest time information (i.e., `metadata_timestamp`, `metadata_updated_on`) of a given Perceval item.
As the results show, this extra step decreases performance by around 20%.

Approach C consists of writing Perceval items to an index, but the unique identifiers are set using
the Perceval `uuid` values. Thus, the operations performed on the index involve both writes and updates, since
the same item may be retrieved several times. In this case the performance is better than in approach A: since
the documents used to test this approach are not modified, it is possible that Lucene performs some
optimization and does not reindex them.

Approach D is similar to approach C, but the documents are modified. As can be seen, this approach
performs worse than approach A (by around 6%).
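
To make the difference between the modes concrete, here is a minimal sketch of how write-only and write-and-update indexing translate to bulk actions with the `elasticsearch` Python client. The client setup, index names, and helper functions are illustrative, not the code used for these tests:

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('https://admin:admin@localhost:9200')


def write_only(index, items):
    # Approach A: no _id is set, so ElasticSearch generates one per document
    # and re-fetched items with the same `uuid` are indexed more than once
    actions = ({'_index': index, '_source': item} for item in items)
    return bulk(es, actions)


def write_and_update(index, items):
    # Approaches C/D: the _id comes from the Perceval `uuid`, so re-fetched
    # items overwrite (update) the existing document instead of duplicating it
    actions = ({'_index': index, '_id': item['uuid'], '_source': item}
               for item in items)
    return bulk(es, actions)
```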

114 changes: 114 additions & 0 deletions bin/test.py
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Valerio Cosentino <[email protected]>


import itertools
import time

from perceval.backends.core.git import Git
from citadel.collections.events import (BY_HOUR,
                                        Events)
from citadel.collections.lookups import Lookups


REPOS = [
    # ("https://github.com/chaoss/grimoirelab-perceval", "/tmp/a"),
    # ("https://github.com/chaoss/grimoirelab-toolkit", "/tmp/b"),
    # ("https://github.com/chaoss/grimoirelab-elk", "/tmp/c"),
    # ("https://github.com/chaoss/grimoirelab-sirmordred", "/tmp/d"),
    # ("https://github.com/chaoss/grimoirelab-graal", "/tmp/e"),
    ("https://github.com/elastic/elasticsearch", "/tmp/f")
]

URL = 'https://admin:admin@localhost:9200'
LOOPS = 4


def test_git():

    # clone the repos so later fetches work on local copies
    for url, local_path in REPOS:
        msg = "Cloning repo {} at {}".format(url, local_path)
        print(msg)

        git = Git(url, local_path)
        commits = [i for i in git.fetch()]

        print(len(commits))

    print("Start test with writes")
    # writes: no field_id, so ElasticSearch assigns the document ids
    events = Events(URL, base_index='events_writes', timeframe=BY_HOUR)
    lookups = Lookups(URL)
    c = 0
    while True:

        for url, local_path in REPOS:

            time_start = time.time()

            git = Git(url, local_path)
            # tee the generator so events and lookups consume the same fetch
            items, items2 = itertools.tee(git.fetch())

            events_ = events.store(items)
            lookups_ = lookups.store(items2)

            spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
            msg = "Repo {} processed: time {}, events {}".format(url, spent_time, events_)
            print(msg)

        c = c + 1

        if c == LOOPS:
            break

    print("Start test with updates")
    # updates: document ids are taken from the Perceval `uuid`
    events = Events(URL, base_index='events_updates', timeframe=BY_HOUR)
    c = 0
    while True:

        for url, local_path in REPOS:

            time_start = time.time()

            git = Git(url, local_path)
            items = git.fetch()

            events_ = events.store(items, field_id="uuid")

            spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
            msg = "Repo {} processed: time {}, events {}".format(url, spent_time, events_)
            print(msg)

        c = c + 1

        if c == LOOPS:
            break


def main():
    test_git()


if __name__ == '__main__':
    main()
Empty file added citadel/collections/__init__.py
Empty file.
127 changes: 127 additions & 0 deletions citadel/collections/events.py
@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <[email protected]>
#


from grimoirelab_toolkit.datetime import datetime_utcnow


from citadel.errors import EventsError
from citadel.storage_engines.elasticsearch import ElasticsearchStorage


EVENTS = 'events'

BY_MINUTE = 'minute'
BY_HOUR = 'hour'
BY_DAY = 'day'
BY_MONTH = 'month'

TIMEFRAMES = [BY_MINUTE, BY_HOUR, BY_DAY, BY_MONTH]


class Events:
Member:

Why is this called Events?

Contributor Author:

We discussed this offline, before working on the PR.

It is called Events because we eventize the Perceval items. You can find more details in the description of approach A:

> Approach A consists of writing Perceval items to an index, leaving ElasticSearch the responsibility
> of assigning unique identifiers; thus, Perceval items with the same `uuid` are indexed more than once.

Member:

I know, but the name doesn't make sense to me. Is this a list of events? It also shouldn't be in the plural.


    PERCEVAL_MAPPING = """
    {
      "mappings": {
        "%s": {
          "dynamic": false,
          "properties": {
            "backend_name": {
              "type": "keyword"
            },
            "backend_version": {
              "type": "keyword"
            },
            "category": {
              "type": "keyword"
            },
            "classified_fields_filtered": {
              "type": "keyword"
            },
            "data": {
              "properties": {}
            },
            "origin": {
              "type": "keyword"
            },
            "perceval_version": {
              "type": "keyword"
            },
            "tag": {
              "type": "keyword"
            },
            "timestamp": {
              "type": "long"
            },
            "updated_on": {
              "type": "long"
            },
            "uuid": {
              "type": "keyword"
            }
          }
        }
      }
    }
    """ % ElasticsearchStorage.ITEMS

    def __init__(self, url, base_index=EVENTS, timeframe=BY_DAY):
        self.storage = ElasticsearchStorage(url)

        if timeframe not in TIMEFRAMES:
            msg = "Unknown timeframe {}".format(timeframe)
            raise EventsError(cause=msg)

        self.timeframe = timeframe
        self.base_index = base_index

    def index_name(self):
Member:

This is still ElasticSearch specific. At this level I think we should avoid this. I think it should be an abstract class, and each specific class will implement the methods needed to write and route where the items will be written.

If that creates a lot of levels of abstraction, all of this can be moved to the StorageEngine class. That would work too, but we have to take into account that it should be independent and without public references to ES.

Contributor Author:

> ...I think it should be an abstract class, and each specific class will implement the methods needed to write and route where the items will be written.

You have just described the StorageEngine class.

> If that creates a lot of levels of abstraction, all of this can be moved to the StorageEngine class. That would work too, but we have to take into account that it should be independent and without public references to ES.

  • Why can't we build on top of the elasticsearch storage engine (which already hides some specificities of dealing with ES)?

  • Who is going to set the index name, or do you plan to create it in a fully automatic way?

Member:

> You have just described the StorageEngine class.

Not really. As StorageEngine is now, you have to be explicit about where you want to store your items. I, as a developer/user, don't want to decide this. The library should decide how to do this and what the best way to do it is. I just want to store data and retrieve data.

> Why can't we build on top of the elasticsearch storage engine (which already hides some specificities of dealing with ES)?

We can and we should. My point is I don't want to expose the concept of index, name selection and so on. The system should do that for me.

> Who is going to set the index name, or do you plan to create it in a fully automatic way?

The library should do that. The developer/user can define a prefix if we want, but nothing else.

The idea behind all of this is to have different levels of abstraction. The lower level should do what StorageEngine is doing now. An upper level should take the items and decide where and how to store them. This level can be integrated in the StorageEngine, but I see two different levels of abstraction. Maybe the current StorageEngine should be something private, so each developer who wants a new system would create their own.
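
A rough sketch of the two-level split described in this thread; every name here (`ItemStore`, `PrefixedStore`, the engine call) is hypothetical and only illustrates the proposal, it is not part of this PR:

```python
from abc import ABC, abstractmethod
from datetime import datetime, timezone


class ItemStore(ABC):
    """Upper level: callers just store items; routing stays internal."""

    @abstractmethod
    def store(self, items):
        """Store items, deciding where and how internally."""


class PrefixedStore(ItemStore):
    """Concrete store: the user supplies a prefix and nothing else."""

    def __init__(self, engine, prefix='items'):
        self._engine = engine  # lower level (storage engine), kept private
        self._prefix = prefix  # the only knob exposed to the user

    def store(self, items):
        # The library, not the caller, picks the destination index
        index = self._pick_index()
        # simplified engine call; a real implementation would batch writes
        return self._engine.write(index, items, item_type='items')

    def _pick_index(self):
        suffix = datetime.now(timezone.utc).strftime("%Y%m%d")
        return "{}_{}".format(self._prefix, suffix)
```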


        def __set_timeframe_format():
Member:

This means we are storing data depending on when it was retrieved. Another possibility is to store data depending on when it was updated/created. That would spread the items among indexes.

Contributor Author:

We discussed this offline, before working on the PR.

Do you want to change the way of storing items? Let me know and I'll change the code as you prefer.

Member:

I'm just wondering which solution will be the best. I like the idea you proposed of storing data as if they were events. Taking that into account, I'd like to understand which solution is better.

Your current solution is to store everything with the date we're getting it. That means we can have a huge index when we start analyzing a project. Later, indexes will be smaller. The good thing is it's fast to store data, and we can quickly access the information we stored at a certain point in time.

The other solution is to store data by when it was updated in the origin. The good thing is it makes searching faster within a date range. You can configure shards to give more resources and make fast searches on all those indexes within a range (for example, the last two years). It also follows better the approach of storing events as they are gathered. The big problem is we need something to route items to their right index (we can have indexes per year and month, like in gharchive), which makes the system slower when writing data.

Either solution would be fine. I just want to think through the pros and cons before implementing it.
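
As a sketch of that second option, routing could derive the index from each item's `updated_on` value (gharchive-style year/month indexes). This assumes `updated_on` holds a Unix timestamp, as in the mapping above; the function name is hypothetical:

```python
from datetime import datetime, timezone


def route_index(base_index, item):
    # Route by when the item was updated at the origin, not by when it
    # was retrieved (assumes `updated_on` is epoch seconds)
    updated = datetime.fromtimestamp(item['updated_on'], tz=timezone.utc)
    return "{}_{}".format(base_index, updated.strftime("%Y%m"))


# route_index('events', {'updated_on': 1546300800}) -> 'events_201901'
```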

            frmt = "%Y%m"
            if self.timeframe == BY_MINUTE:
                frmt = "%Y%m%d_%Hh%Mm"
            elif self.timeframe == BY_HOUR:
                frmt = "%Y%m%d_%Hh"
            elif self.timeframe == BY_DAY:
                frmt = "%Y%m%d"

            return frmt

        timeframe_format = __set_timeframe_format()
        timeframe = datetime_utcnow().replace(tzinfo=None).strftime(timeframe_format)

        index_name = self.base_index + "_" + timeframe

        return index_name

    def store(self, data, field_id=None):

        index = self.index_name()

        if not self.storage.elasticsearch.indices.exists(index):
            self.storage.create_index(index, self.PERCEVAL_MAPPING)
            self.storage.set_alias(self.base_index, index)

        written = self.storage.write(index, data, field_id=field_id)

        return written
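
A minimal usage sketch for this class, assuming a local ElasticSearch reachable at the URL below and the Perceval Git backend:

```python
from perceval.backends.core.git import Git
from citadel.collections.events import BY_DAY, Events

events = Events('https://admin:admin@localhost:9200', base_index='events', timeframe=BY_DAY)
items = Git('https://github.com/chaoss/grimoirelab-toolkit', '/tmp/toolkit').fetch()

# Write-only: ElasticSearch assigns the document ids (approach A)
written = events.store(items)

# Write-and-update: ids come from the Perceval `uuid` (approaches C and D)
# written = events.store(items, field_id='uuid')
print(written)
```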
87 changes: 87 additions & 0 deletions citadel/collections/lookups.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <[email protected]>
#

from citadel.storage_engines.elasticsearch import ElasticsearchStorage


class Lookups:
Member:

What's the purpose of this class?

Contributor Author:

We discussed this offline, before working on the PR.

The lookups index keeps the last value of the Perceval items inserted in the events index. It is used in approach B:

> Approach B extends approach A by keeping the metadata in a separate index in order
> to track the latest time information (i.e., `metadata_timestamp`, `metadata_updated_on`) of a given Perceval item.
> As the results show, this extra step decreases performance by around 20%.

Member:

I thought we weren't going to implement this in this step. Anyway, why does this need a class, and why not integrate it with the StorageEngine?

Contributor Author:

I put Events and Lookups outside since they were more of a POC to evaluate different approaches.


    UUID_SEARCH = 'uuid'
    LOOKUPS = 'lookups'
    LOOKUPS_MAPPING = """
    {
      "mappings": {
        "%s": {
          "dynamic": false,
          "_source": {
            "excludes": [
              "data"
            ]
          },
          "properties": {
            "backend_name": {
              "type": "keyword"
            },
            "backend_version": {
              "type": "keyword"
            },
            "category": {
              "type": "keyword"
            },
            "classified_fields_filtered": {
              "type": "keyword"
            },
            "origin": {
              "type": "keyword"
            },
            "perceval_version": {
              "type": "keyword"
            },
            "tag": {
              "type": "keyword"
            },
            "timestamp": {
              "type": "long"
            },
            "updated_on": {
              "type": "long"
            },
            "uuid": {
              "type": "keyword"
            }
          }
        }
      }
    }
    """ % LOOKUPS

    def __init__(self, url, index=LOOKUPS):
        self.storage = ElasticsearchStorage(url)
        self.index = index

    def store(self, data):

        if not self.storage.elasticsearch.indices.exists(self.index):
            self.storage.create_index(self.index, self.LOOKUPS_MAPPING)

        written = self.storage.write(self.index, data, item_type=self.LOOKUPS, field_id=self.UUID_SEARCH)

        return written
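
A short usage sketch, again assuming a local instance; since documents are keyed by `uuid`, repeated runs upsert the lookup entries rather than duplicating them:

```python
from perceval.backends.core.git import Git
from citadel.collections.lookups import Lookups

lookups = Lookups('https://admin:admin@localhost:9200')
items = Git('https://github.com/chaoss/grimoirelab-toolkit', '/tmp/toolkit').fetch()
written = lookups.store(items)
print(written)
```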
6 changes: 6 additions & 0 deletions citadel/errors.py
@@ -40,3 +40,9 @@ class StorageEngineError(BaseError):
"""Generic error for storage engines"""

message = "%(cause)s"


class EventsError(BaseError):
"""Generic error for events"""

message = "%(cause)s"
5 changes: 4 additions & 1 deletion citadel/storage_engine.py
@@ -25,11 +25,14 @@ class StorageEngine:

    This class allows performing write operations on storage engines.
    """
    def write(self, resource, data, chunk_size=None):
    def write(self, resource, data, item_type, chunk_size=None, field_id=None):
        """Write method to be redefined by subclasses

        :param resource: the location where the data is written
        :param data: the data to be written
        :param item_type: type of the item
        :param chunk_size: size of data chunks
        :param field_id: field representing the ID of the item. If None the
            ID generation is delegated to the storage engine
        """
        raise NotImplementedError
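
A sketch of how a subclass might honor this contract: an in-memory engine, purely illustrative and not part of the PR:

```python
import itertools

from citadel.storage_engine import StorageEngine


class InMemoryStorage(StorageEngine):
    """Toy engine: keeps items in per-resource dicts, for illustration only."""

    def __init__(self):
        self.resources = {}
        self._ids = itertools.count()

    def write(self, resource, data, item_type, chunk_size=None, field_id=None):
        bucket = self.resources.setdefault(resource, {})
        written = 0
        for item in data:
            # Delegate id generation to the engine when no field_id is given
            key = item[field_id] if field_id else next(self._ids)
            bucket[key] = item  # same key -> update, new key -> write
            written += 1
        return written
```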