Skip to content

Commit f05ba82

Browse files
authored
Merge pull request #4 from fkrueger/master
added the superfluous (and wrong) application/json Content-Type,…
2 parents b1ad60f + 06b6f28 commit f05ba82

2 files changed

Lines changed: 33 additions & 12 deletions

File tree

README.md

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,22 @@ A tool for removing duplicated documents that are grouped by some unique field (
77

88
Usage:
99
```
10-
python3 -u dedupe.py -H localhost -i 2017.03.17 -b 10000 -m 100 -a -f Uuid --prefix nginx_logs > es_dedupe.log
10+
python -u dedupe.py -H localhost -P 9200 -i exact-index-name -f Uuid > es_dedupe.log
1111
```
12-
will try to find duplicated documents in index called `nginx_logs-2017.03.17` where documents are grouped by `Uuid` field.
12+
will try to find duplicated documents in an index called `exact-index-name`, where documents are grouped by the `Uuid` field.
1313

14-
* `-a` will process all indexes named with patterh `%Y.%m.%d` until today.
14+
```
15+
python -u dedupe.py -H localhost -P 9200 --all --prefix 'esindexprefix' --prefixseparator '-' --indexexclude '^excludedindex.*' -f fingerprint > es_dedupe.log
16+
```
17+
will try to find duplicated documents in all indices known to the ES instance on localhost:9200 whose names match 'esindexprefix-\*', excluding all indices starting with 'excludedindex', where documents are grouped by the `fingerprint` field.
18+
19+
* `-a` will process all indexes known to the ES instance that match the prefix and prefixseparator.
1520
* `-b` batch size - critical for performance; ES queries might take several minutes, depending on the size of your indexes
1621
* `-f` name of field that should be unique
1722
* `-h` displays help
1823
* `-m` number of duplicated documents with same unique field value
1924
* `-t` document type in ES
20-
* `--sleep 60` time between aggregation requests (gives ES time to run GC on heap)
25+
* `--sleep 60` time between aggregation requests (gives ES time to run GC on heap), 15 seconds seems to be enough to avoid triggering ES flood protection though.
2126

2227
WARNING: Running huge bulk operations on ES cluster might influence performance of your cluster or even crash some nodes if heap
2328
is not large enough. Increment `-b` and `-m` parameters with caution! ES returns at most `b * m` documents, eventually you might hit
@@ -28,7 +33,7 @@ A log file containing documents with unique fields is written into `/tmp/es_dedu
2833
By design ES aggregate queries are not necessarily precise. Depending on your cluster setup, some documents won't be deleted due to
2934
inaccurate shard statistics.
3035

31-
Running `$ python3 dedupe.py --check_log /tmp/es_dedupe.log --noop` will query for documents found by aggregate and queries check whether were actually
36+
Running `$ python dedupe.py --check_log /tmp/es_dedupe.log --noop` will query for the documents found by the aggregate queries and check whether they were actually
3237
deleted.
3338
```
3439
== Starting ES deduplicator....
@@ -56,11 +61,21 @@ Deleted 276673 duplicates, in total 609802. Batch processed in 0:00:08.487847, r
5661
```
5762

5863
## Requirements
64+
For the installation use the tools provided by your operating system.
65+
66+
On Linux this can be one of the following: yum, dnf, apt, yast, emerge, ..
67+
```
68+
* Install python (2 or 3, both will work)
69+
* Install python*ujson and python*requests for the fitting python version
70+
```
71+
72+
On Windows you are pretty much on your own, but fear not, you can do the following ;-)
5973
```
60-
apt install python3-dev
61-
pip3 install -r requirements.txt
74+
* Download and install a python version from https://www.python.org/ .
75+
* Open a console terminal and head to the repository copy of es-deduplicator, then run:
76+
pip install -r requirements.txt
6277
```
6378

6479
## History
6580

66-
Originaly written in bash which performed terribly due to slow JSON processing with pipes and `jq`. Python with `ujson` seems to be better fitted for this task.
81+
Originally written in bash which performed terribly due to slow JSON processing with pipes and `jq`. Python with `ujson` seems to be better fitted for this task.

dedupe.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333

3434
# our current script name (minus the path)
3535
ourname = os.path.basename(__file__)
36+
# At least Elasticsearch 6.2.2 does not support application/x-ndjson, but wants to enforce setting an explicit Content-Type. As to why Elastic wouldn't support this, I have no idea.
37+
es_headers = { 'Content-Type': 'application/json' }
38+
3639

3740

3841

@@ -204,14 +207,15 @@ def allsettings_uri(args):
204207

205208

206209
def fetch_indexlist(args):
210+
global es_headers
207211
uri = idxlist_uri(args)
208212
payload = {}
209213
try:
210214
json = ujson.dumps(payload)
211215
if args.verbose:
212216
logme("## GET {0}".format(uri))
213217
logme("##\tdata. {0}".format(json))
214-
resp = requests.get(uri, data=json)
218+
resp = requests.get(uri, data=json, headers=es_headers)
215219
if args.debug:
216220
logme("## resp: {0}".format(resp.text))
217221
if (resp.status_code == 200):
@@ -375,12 +379,13 @@ def bulk_remove(buf, args):
375379

376380

377381
def fetch_allsettings(args):
382+
global es_headers
378383
tmpidx2settings = {}
379384
try:
380385
uri = allsettings_uri(args)
381386
if args.verbose:
382387
logme("# GET {}".format(uri))
383-
resp = requests.get(uri, data={})
388+
resp = requests.get(uri, data={}, headers=es_headers)
384389
# {"indexname_109":{"settings":{"index":{"number_of_shards":"4","blocks":{"write":"false","metadata":"false","read":"false"},"provided_name":"indexname_109","creation_date":"1520121603118","analysis":{"analyzer":{"analyzer_keyword":{"filter":"lowercase","tokenizer":"keyword"}}},"number_of_replicas":"0","uuid":"some-uuid-really-now","version":{"created":"5060499"}}}}, ....}
385390
r = {}
386391
if args.debug:
@@ -490,6 +495,7 @@ def check_docs(file, args):
490495

491496

492497
def msearch(query, args, stats, docs):
498+
global es_headers
493499
cnt_deleted = 0
494500
try:
495501
uri = msearch_uri(args)
@@ -501,7 +507,7 @@ def msearch(query, args, stats, docs):
501507
to_del = StringIO()
502508
to_log = StringIO()
503509
while True:
504-
resp = requests.get(uri, data=query)
510+
resp = requests.get(uri, data=query, headers=es_headers)
505511
if args.debug:
506512
logme("## resp: {0}".format(resp.text))
507513
if (resp.status_code == 200):
@@ -599,7 +605,7 @@ def print_stats(msg, stats, args):
599605

600606
parser = argparse.ArgumentParser(description="Elasticsearch dupe deleter")
601607
parser.add_argument("-a", "--all",
602-
action="store_true", dest="all", default=False,
608+
action="store_true", dest="all", default=True,
603609
help="All indexes from given date till today")
604610
parser.add_argument("-b", "--batch",
605611
dest="batch", default=10, type=int,

0 commit comments

Comments
 (0)