Skip to content

Commit 64d6b8e

Browse files
committed
index english_wordnet; fix ru(s) metadata; rebuild data index
1 parent 6f9d40e commit 64d6b8e

File tree

7 files changed

+18
-15
lines changed

7 files changed

+18
-15
lines changed

Makefile

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@ BASEURL = https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages
44
pkg_index:
55
$(PYTHON) tools/build_collections.py .
66
$(PYTHON) tools/build_pkg_index.py . $(BASEURL) index.xml
7-
git add collections
8-
git add index.xml
9-
git commit -m "updated data index"
7+
git add index.xml collections
108

119
grammars:
1210
git commit -m "updated grammar files" packages/grammars

collections/all-corpora.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<item ref="crubadan" />
1919
<item ref="dependency_treebank" />
2020
<item ref="dolch" />
21+
<item ref="english_wordnet" />
2122
<item ref="europarl_raw" />
2223
<item ref="extended_omw" />
2324
<item ref="floresta" />

collections/all-nltk.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
<item ref="crubadan" />
2626
<item ref="dependency_treebank" />
2727
<item ref="dolch" />
28+
<item ref="english_wordnet" />
2829
<item ref="europarl_raw" />
2930
<item ref="extended_omw" />
3031
<item ref="floresta" />

collections/all.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
<item ref="crubadan" />
2626
<item ref="dependency_treebank" />
2727
<item ref="dolch" />
28+
<item ref="english_wordnet" />
2829
<item ref="europarl_raw" />
2930
<item ref="extended_omw" />
3031
<item ref="floresta" />

index.xml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
<packages>
55
<package id="abc" name="Australian Broadcasting Commission 2006" webpage="http://www.abc.net.au/" author="Australian Broadcasting Commission" unzip="1" unzipped_size="4054966" size="1487851" checksum="ffb36b67ff24cbf7daaf171c897eb904" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/abc.zip" />
66
<package id="alpino" name="Alpino Dutch Treebank" webpage="http://www.let.rug.nl/~vannoord/trees/" contact="Gertjan van Noord" license="Distributed with permission of Gertjan van Noord" unzip="1" unzipped_size="21604821" size="2797255" checksum="ae529a1c5f13d6074f5b0d68d8edb537" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/alpino.zip" />
7-
<package id="averaged_perceptron_tagger" name="Averaged Perceptron Tagger" languages="English" unzip="1" unzipped_size="6138625" size="2526731" checksum="05c91d607ee1043181233365b3f76978" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip" />
8-
<package id="averaged_perceptron_tagger_eng" name="Averaged Perceptron Tagger (JSON)" languages="English" unzip="1" unzipped_size="5703817" size="1539115" checksum="729e2255f83045670374180de9bdb613" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_eng.zip" />
9-
<package id="averaged_perceptron_tagger_ru" name="Averaged Perceptron Tagger (Russian)" webpage="http://www.ruscorpora.ru/en/" languages="Russian" unzip="1" unzipped_size="23247411" size="8628828" checksum="f7051368e4aff6718f8b38c1362dfdb1" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_ru.zip" />
10-
<package id="averaged_perceptron_tagger_rus" name="Averaged Perceptron Tagger (Russian)" webpage="http://www.ruscorpora.ru/en/" languages="Russian" unzip="1" unzipped_size="30246815" size="5997187" checksum="073f704b73bf8d88037e464852e34420" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_rus.zip" />
7+
<package id="averaged_perceptron_tagger" name="Averaged Perceptron Tagger" languages="English" license="MIT License" webpage="https://github.com/sloria/textblob-aptagger" unzip="1" unzipped_size="6138625" size="2526731" checksum="05c91d607ee1043181233365b3f76978" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip" />
8+
<package id="averaged_perceptron_tagger_eng" name="Averaged Perceptron Tagger (JSON)" languages="English" license="MIT License" webpage="https://github.com/sloria/textblob-aptagger" unzip="1" unzipped_size="5703817" size="1539115" checksum="729e2255f83045670374180de9bdb613" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_eng.zip" />
9+
<package id="averaged_perceptron_tagger_ru" name="Averaged Perceptron Tagger (Russian)" webpage="http://www.ruscorpora.ru/en/" languages="Russian" license="MIT License" unzip="1" unzipped_size="23247411" size="8628828" checksum="f7051368e4aff6718f8b38c1362dfdb1" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_ru.zip" />
10+
<package id="averaged_perceptron_tagger_rus" name="Averaged Perceptron Tagger (Russian)" webpage="http://www.ruscorpora.ru/en/" languages="Russian" license="MIT License" unzip="1" unzipped_size="30246815" size="5997187" checksum="073f704b73bf8d88037e464852e34420" subdir="taggers" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_rus.zip" />
1111
<package id="basque_grammars" name="Grammars for Basque" author="Kepa Sarasola" languages="Spanish" unzip="1" unzipped_size="5550" size="4704" checksum="0e3518cb2aeb2600cb2841df7f035606" subdir="grammars" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/grammars/basque_grammars.zip" />
1212
<package id="bcp47" name="BCP-47 Language Tags" license="IETF Trust and Unicode Inc." copyright="Copyright (c) 2022 IETF Trust and Copyright (c) 1991-2022 Unicode" webpage="https://www.rfc-editor.org/rfc/rfc5646.html" unzip="0" unzipped_size="1433135" size="222952" checksum="8ef6c0dfa7661e3338dd99c495a7d9b6" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/bcp47.zip" />
1313
<package id="biocreative_ppi" name="BioCreAtIvE (Critical Assessment of Information Extraction Systems in Biology)" webpage="http://www.mitre.org/public/biocreative/" copyright="Public Domain (not copyrighted)" license="Public Domain" unzip="1" unzipped_size="1537086" size="223566" checksum="d3be36b53ab201372f1cd63ffc75e9a9" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/biocreative_ppi.zip" />
@@ -28,6 +28,7 @@
2828
<package id="crubadan" name="Crubadan Corpus" copyright="Copyright (C) 2010 Kevin Scannell" author="Kevin Scannell" license="GPLv3" webpage="http://borel.slu.edu/crubadan/" unzip="1" unzipped_size="11256183" size="5288655" checksum="3cc831382dec41b8d9a06d93ef300352" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/crubadan.zip" />
2929
<package id="dependency_treebank" name="Dependency Parsed Treebank" sample="True" copyright="Copyright (C) 1995 University of Pennsylvania" license="This is a 10% fragment of Penn Treebank, (C) LDC 1995, which has been dependency parsed. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only." unzip="1" unzipped_size="1069540" size="457429" checksum="631e959acaa42eea718daf04c5cdfa76" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip" />
3030
<package id="dolch" name="Dolch Word List" webpage="https://en.wikipedia.org/wiki/Dolch_word_list" unzip="1" unzipped_size="1917" size="2116" checksum="6f9c042774b96366c93fd0f9a9adb697" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dolch.zip" />
31+
<package id="english_wordnet" name="Open English Wordnet" version="2024" license="This resource is derived from Princeton WordNet under the WordNet License and further developed under the Creative Commons Attribution 4.0 International License. You may share and adapt this resource providing attribution is given to both Princeton WordNet and the Open English WordNet team." copyright="Open English Wordnet 2024 Copyright 2024 by the Open English Wordnet team. WordNet 3.1 Copyright 2011 by Princeton University. All rights reserved." webpage="https://en-word.net/" unzip="1" unzipped_size="38860076" size="11458203" checksum="a82dfe03cab9f6c9d85f8da8de88613b" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/english_wordnet.zip" />
3132
<package id="europarl_raw" name="Sample European Parliament Proceedings Parallel Corpus" author="Philipp Koehn, University of Edinburgh" webpage="http://www.statmt.org/europarl" unzip="1" unzipped_size="41396100" size="12594977" checksum="7621d5675990b1decc012c823716ee76" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/europarl_raw.zip" />
3233
<package id="extended_omw" name="Extended Open Multilingual WordNet" copyright="Copyright (C) 2013 Francis Bond and Ryan Foster" license="CC by SA 3.0 Licence (for data from Wikitionary) and Unicode, Inc. Licence Agreement (for data from CLDR)" webpage="http://compling.hss.ntu.edu.sg/omw/summx.html" unzip="0" unzipped_size="36087752" size="11251284" checksum="8cc3931b20fdc2a2fe1ed9d42567d51b" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/extended_omw.zip" />
3334
<package id="floresta" name="Portuguese Treebank" license="Non-commercial use only" webpage="http://www.linguateca.pt/Floresta/" unzip="1" unzipped_size="16414136" size="1882021" checksum="de5f1df09949f080e0f616f0bc55967d" subdir="corpora" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/floresta.zip" />
@@ -150,6 +151,7 @@
150151
<item ref="crubadan" />
151152
<item ref="dependency_treebank" />
152153
<item ref="dolch" />
154+
<item ref="english_wordnet" />
153155
<item ref="europarl_raw" />
154156
<item ref="extended_omw" />
155157
<item ref="floresta" />
@@ -264,6 +266,7 @@
264266
<item ref="crubadan" />
265267
<item ref="dependency_treebank" />
266268
<item ref="dolch" />
269+
<item ref="english_wordnet" />
267270
<item ref="europarl_raw" />
268271
<item ref="extended_omw" />
269272
<item ref="floresta" />
@@ -364,6 +367,7 @@
364367
<item ref="crubadan" />
365368
<item ref="dependency_treebank" />
366369
<item ref="dolch" />
370+
<item ref="english_wordnet" />
367371
<item ref="europarl_raw" />
368372
<item ref="extended_omw" />
369373
<item ref="floresta" />
packages/taggers/averaged_perceptron_tagger_ru.xml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
<package id='averaged_perceptron_tagger_ru'
2-
name='Averaged Perceptron Tagger (Russian)'
3-
webpage='http://www.ruscorpora.ru/en/'
1+
<package id="averaged_perceptron_tagger_ru"
2+
name="Averaged Perceptron Tagger (Russian)"
3+
webpage="http://www.ruscorpora.ru/en/"
44
languages="Russian"
55
license="MIT License"
6-
webpage="https://github.com/sloria/textblob-aptagger"
76
unzip="1"
87
/>
packages/taggers/averaged_perceptron_tagger_rus.xml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
<package id='averaged_perceptron_tagger_rus'
2-
name='Averaged Perceptron Tagger (Russian)'
3-
webpage='http://www.ruscorpora.ru/en/'
1+
<package id="averaged_perceptron_tagger_rus"
2+
name="Averaged Perceptron Tagger (Russian)"
3+
webpage="http://www.ruscorpora.ru/en/"
44
languages="Russian"
55
license="MIT License"
6-
webpage="https://github.com/sloria/textblob-aptagger"
76
unzip="1"
87
/>

0 commit comments

Comments
 (0)