diff --git a/.circleci/config.yml b/.circleci/config.yml index f645f6101..bcf9b6297 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,61 +1,19 @@ -version: 2 +version: 2.1 + +orbs: + python: circleci/python@0.2.1 + jobs: - test_api_and_main_and_upload: - docker: - - image: circleci/python + build-and-test: + executor: python/default steps: - checkout - - run: - name: init .pypirc - command: | - echo -e "[pypi]" >> ~/.pypirc - - run: - name: install requirements - command: | - sudo apt-get install -y libblas3 liblapack3 - sudo apt-get install -y liblapack-dev libblas-dev - cd /home/circleci/project/ - pip3 install --user -r requirements.txt - - run: - name: test main - command: | - cd /home/circleci/project/ - python3 tests/main_tests.py - - run: - name: test api - command: | - cd /home/circleci/project/ - python3 tests/api_tests.py - - run: - name: create packages - command: | - python3 setup.py sdist - python3 setup.py bdist_wheel - - run: - name: upload to pypi - command: | - if [[ "$PYPI_USERNAME" == "" ]]; then - echo "Skip upload" - exit 0 - fi - python3 -m pip install --user jq - if [[ "$CIRCLE_BRANCH" == "master" ]]; then - PYPI="pypi.org" - else - PYPI="test.pypi.org" - fi - LATEST_VERSION="$(curl -s https://$PYPI/pypi/nlpcube/json | jq -r '.info.version')" - THIS_VERSION=`python3 <<< "import pkg_resources;print(pkg_resources.require('nlpcube')[0].version)"` - if [[ $THIS_VERSION != $LATEST_VERSION ]]; then - echo "\n\nthis: $THIS_VERSION - latest: $LATEST_VERSION => releasing to $PYPI\n\n" - python3 -m pip install --user --upgrade twine - python3 -m twine upload --repository-url https://$PYPI/legacy/ dist/* -u $PYPI_USERNAME -p $PYPI_PASSWORD || echo "Package already exists" - else - echo "this: $THIS_VERSION = latest: $LATEST_VERSION => skip release" - fi + - python/load-cache + - python/install-deps + - python/save-cache + - run: echo "done" workflows: - version: 2 - test_api_and_main_and_upload: + main: jobs: - - test_api_and_main_and_upload + - build-and-test diff --git a/.gitignore b/.gitignore index 118fc4c4e..ff06ff65f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,15 @@ +.DS_Store +Languasito/data/ +*.txt +lightning_logs +*.gz +*.encodings +*.npy +data/* +nlp-cube-models/* +corpus/ +models/ +scripts/packer *.pyc build/ dist/ @@ -11,12 +23,14 @@ cube/venv/* .idea/* venv/* cube/*.py +*.json -models/ +scratch/ tests/scratch/* scripts/*.json scripts/*.conllu scripts/*.md +scripts/wikiextractor.py # Jupyter notebooks notebooks/.ipynb_checkpoints/* diff --git a/Languasito/.idea/.gitignore b/Languasito/.idea/.gitignore new file mode 100644 index 000000000..73f69e095 --- /dev/null +++ b/Languasito/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/cube/.idea/cube.iml b/Languasito/.idea/Languasito.iml similarity index 50% rename from cube/.idea/cube.iml rename to Languasito/.idea/Languasito.iml index 2b8555738..1cf801cda 100644 --- a/cube/.idea/cube.iml +++ b/Languasito/.idea/Languasito.iml @@ -2,12 +2,9 @@ - + - + - - \ No newline at end of file diff --git a/Languasito/.idea/inspectionProfiles/Project_Default.xml b/Languasito/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 000000000..53bc7d72c --- /dev/null +++ b/Languasito/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,47 @@ + + + + \ No newline at end of file diff --git 
a/Languasito/.idea/inspectionProfiles/profiles_settings.xml b/Languasito/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 000000000..105ce2da2 --- /dev/null +++ b/Languasito/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/cube/.idea/misc.xml b/Languasito/.idea/misc.xml similarity index 68% rename from cube/.idea/misc.xml rename to Languasito/.idea/misc.xml index b80ab027e..0164e1770 100644 --- a/cube/.idea/misc.xml +++ b/Languasito/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/cube/.idea/modules.xml b/Languasito/.idea/modules.xml similarity index 57% rename from cube/.idea/modules.xml rename to Languasito/.idea/modules.xml index c112f1de4..8e0178c2b 100644 --- a/cube/.idea/modules.xml +++ b/Languasito/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/Languasito/.idea/other.xml b/Languasito/.idea/other.xml new file mode 100644 index 000000000..a708ec781 --- /dev/null +++ b/Languasito/.idea/other.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/cube/generic_networks/__init__.py b/Languasito/languasito/__init__.py similarity index 100% rename from cube/generic_networks/__init__.py rename to Languasito/languasito/__init__.py diff --git a/Languasito/languasito/api.py b/Languasito/languasito/api.py new file mode 100644 index 000000000..38fa22916 --- /dev/null +++ b/Languasito/languasito/api.py @@ -0,0 +1,63 @@ +import sys +import torch +from typing import * + +sys.path.append('') + +from languasito.model import Languasito +from languasito.utils import LanguasitoCollate +from languasito.utils import Encodings + + +class LanguasitoAPI: + + def __init__(self, languasito: Languasito, encodings: Encodings): + self._languasito = languasito + self._languasito.eval() + self._encodings = encodings + self._collate = LanguasitoCollate(encodings, live=True) + self._device = 'cpu' + + def to(self, device: str): + self._languasito.to(device) + self._device = device + + def __call__(self, batch): + with torch.no_grad(): + x = self._collate.collate_fn(batch) + for key in x: + if isinstance(x[key], torch.Tensor): + x[key] = x[key].to(self._device) + rez = self._languasito(x) + emb = [] + pred_emb = rez['emb'].detach().cpu().numpy() + for ii in range(len(batch)): + c_emb = [] + for jj in range(len(batch[ii])): + c_emb.append(pred_emb[ii, jj]) + emb.append(c_emb) + return emb + + @staticmethod + def load(model_name: str): + from pathlib import Path + home = str(Path.home()) + filename = '{0}/.languasito/{1}'.format(home, model_name) + import os + if os.path.exists(filename + '.encodings'): + return LanguasitoAPI.load_local(filename) + else: + print("UserWarning: Model not found and automatic downloading is not yet supported") + return None + + @staticmethod + def load_local(model_name: str): + enc = Encodings() + enc.load('{0}.encodings'.format(model_name)) + model = Languasito(enc) + tmp = torch.load('{0}.best'.format(model_name), map_location='cpu') + # model.load(tmp['state_dict']) + model.load_state_dict(tmp['state_dict']) + model.eval() + api = LanguasitoAPI(model, enc) + return api diff --git a/Languasito/languasito/model.py b/Languasito/languasito/model.py new file mode 100644 index 000000000..47dcc806a --- /dev/null +++ b/Languasito/languasito/model.py @@ -0,0 +1,221 @@ +import sys + +sys.path.append('') +import pytorch_lightning as pl +import torch +import torch.nn as nn +from typing import * +import numpy as np +import random + +from 
languasito.utils import Encodings, mask_concat +from languasito.modules import WordGram, LinearNorm, CosineLoss, WordDecoder + + +class Languasito(pl.LightningModule): + def __init__(self, encodings: Encodings): + super().__init__() + NUM_FILTERS = 512 + RNN_SIZE = 256 + CHAR_EMB_SIZE = 128 + ATT_DIM = 64 + NUM_HEADS = 8 + + self._wg = WordGram(len(encodings.char2int), num_langs=1, num_filters=512, num_layers=5) + self._rnn_fw = nn.LSTM(NUM_FILTERS // 2, RNN_SIZE, num_layers=3, batch_first=True, bidirectional=False) + self._rnn_bw = nn.LSTM(NUM_FILTERS // 2, RNN_SIZE, num_layers=3, batch_first=True, bidirectional=False) + self._linear_out = LinearNorm(RNN_SIZE * 2, NUM_FILTERS // 2) + self._early_stop_meta_val = 0 + self._res = {"b_loss": 9999} + self._start_stop = nn.Embedding(2, NUM_FILTERS // 2) + self._epoch_results = None + self._loss_function = nn.CrossEntropyLoss(ignore_index=0) + self._repr1_ff = nn.Sequential(nn.LayerNorm(RNN_SIZE), nn.Linear(RNN_SIZE, NUM_FILTERS), nn.ReLU(), + nn.LayerNorm(NUM_FILTERS), nn.Linear(NUM_FILTERS, NUM_FILTERS), nn.ReLU()) + self._repr2_ff = nn.Sequential(nn.LayerNorm(ATT_DIM * NUM_HEADS), nn.Linear(ATT_DIM * NUM_HEADS, NUM_FILTERS), + nn.ReLU(), + nn.LayerNorm(NUM_FILTERS), nn.Linear(NUM_FILTERS, NUM_FILTERS), nn.ReLU()) + self._key = nn.Sequential(nn.Linear(RNN_SIZE, ATT_DIM), nn.Tanh()) + self._value = nn.Sequential(nn.Linear(RNN_SIZE, ATT_DIM), nn.Tanh()) + self._att_fn_fw = nn.MultiheadAttention(RNN_SIZE, NUM_HEADS, kdim=ATT_DIM, vdim=ATT_DIM) + self._att_fn_bw = nn.MultiheadAttention(RNN_SIZE, NUM_HEADS, kdim=ATT_DIM, vdim=ATT_DIM) + cond_size = NUM_FILTERS + self._word_reconstruct = WordDecoder(cond_size, CHAR_EMB_SIZE, len(encodings.word_decomposer._tok2int) + 4) + self._cosine_loss = CosineLoss() + + def forward(self, X, return_w=False, imagine=False): + x_words_chars = X['x_word_char'] + x_words_case = X['x_word_case'] + x_lang_word = X['x_lang_word'] + x_sent_len = X['x_sent_len'] + x_word_len = X['x_word_len'] + x_word_masks = X['x_word_masks'] + x_max_len = X['x_max_len'] + char_emb_packed = self._wg(x_words_chars, x_words_case, x_lang_word, x_word_masks, x_word_len) + + blist_char = [] + + sl = x_sent_len.cpu().numpy() + pos = 0 + for ii in range(x_sent_len.shape[0]): + slist_char = [] + slist_char.append( + self._start_stop(torch.zeros((1), dtype=torch.long, device=self._get_device()))) + for jj in range(sl[ii]): + slist_char.append(char_emb_packed[pos, :].unsqueeze(0)) + pos += 1 + + slist_char.append( + self._start_stop(torch.ones((1), dtype=torch.long, device=self._get_device()))) + + for jj in range(x_max_len - sl[ii]): + slist_char.append(torch.zeros((1, 512 // 2), + device=self._get_device(), dtype=torch.float)) + + sent_emb = torch.cat(slist_char, dim=0) + blist_char.append(sent_emb.unsqueeze(0)) + + char_emb = torch.cat(blist_char, dim=0) + out_fw, _ = self._rnn_fw(char_emb) + out_bw, _ = self._rnn_bw(torch.flip(char_emb, [1])) + out_bw = torch.flip(out_bw, [1]) + lexical = char_emb[:, 1:-1, :] + out_fw = out_fw + char_emb + out_bw = out_bw + char_emb + pre_context = torch.cat([out_fw[:, :-2, :], out_bw[:, 2:, :]], dim=-1) + context = torch.tanh(self._linear_out(pre_context)) + + concat = torch.cat([lexical, context, pre_context], dim=-1) + + y = {'lexical': lexical, 'context': context, 'emb': concat} # , 'sent': sent} + + if return_w: + att_value = self._apply_masked_attention(out_fw[:, :-2, :], out_bw[:, 2:, :]) + # att_value = torch.zeros_like(att_value) + # context = torch.zeros_like(context) + # repr1 = 
self._repr1_ff(context) + repr2 = self._repr2_ff(att_value) + # cond = torch.cat([repr1, repr2], dim=-1) + # cond = mask_concat([repr1, repr2], 0.33, self.training, self._get_device()) + cond = repr2 + + # cond = repr1 + cond_packed = [] + for ii in range(x_sent_len.shape[0]): + for jj in range(x_sent_len[ii]): + cond_packed.append(cond[ii, jj].unsqueeze(0)) + cond_packed = torch.cat(cond_packed, dim=0) + if imagine: + x_char_pred = self._word_reconstruct(cond_packed, gs_chars=None) + else: + x_char_pred = self._word_reconstruct(cond_packed, gs_chars=X['x_word_targets']) + y['x_char_pred'] = x_char_pred + + return y + + def _apply_masked_attention(self, fw, bw): + # forward + att_query = fw + att_mask = np.ones((fw.shape[1], fw.shape[1]), dtype=np.float) + for ii in range(fw.shape[1]): + for jj in range(ii + 1, fw.shape[1]): + att_mask[ii, jj] = 0 + att_mask = torch.tensor(att_mask, device=self._get_device()) + att_key = self._key(fw) + att_val = self._value(fw) + att_mask = att_mask.float().masked_fill(att_mask == 0, float('-inf')).masked_fill(att_mask == 1, float(0.0)) + att_value_fw, _ = self._att_fn_fw(att_query.permute(1, 0, 2), att_key.permute(1, 0, 2), + att_val.permute(1, 0, 2), attn_mask=att_mask) + + att_query = bw + att_mask = np.ones((bw.shape[1], bw.shape[1]), dtype=np.float) + for ii in range(bw.shape[1]): + for jj in range(0, ii): + att_mask[ii, jj] = 0 + att_mask = torch.tensor(att_mask, device=self._get_device()) + att_mask = att_mask.float().masked_fill(att_mask == 0, float('-inf')).masked_fill(att_mask == 1, float(0.0)) + att_key = self._key(bw) + att_val = self._value(bw) + + att_value_bw, _ = self._att_fn_bw(att_query.permute(1, 0, 2), att_key.permute(1, 0, 2), + att_val.permute(1, 0, 2), attn_mask=att_mask) + + return torch.cat([att_value_fw, att_value_bw], dim=-1).permute(1, 0, 2) + + def training_step(self, batch, batch_idx): + Y = self.forward(batch, return_w=True) + x_char_target = batch['x_word_targets'][:, 1:] + x_char_pred = Y['x_char_pred'] + loss_rec = self._loss_function(x_char_pred.reshape(-1, x_char_pred.shape[2]), x_char_target.reshape(-1)) + # y_lexical = Y['lexical'] + # y_context = Y['context'] + # sl = batch['x_sent_len'].detach().cpu().numpy() + # word_repr = [] + # sent_repr = y_sent + # for ii in range(sl.shape[0]): + # for jj in range(sl[ii]): + # if True: # random.random() < 0.15: + # word_repr.append(y_lexical[ii, jj].unsqueeze(0)) + # word_repr.append(y_context[ii, jj].unsqueeze(0)) + # + # word_repr = torch.cat(word_repr, dim=0) + # word_repr = word_repr.reshape(-1, 2, word_repr.shape[1]) + # loss_cosine = self._cosine_loss(word_repr) + # + # # sent_repr = sent_repr.reshape(-1, 2, sent_repr.shape[1]) + # # loss_sent = self._ge2e_sent(sent_repr) + return loss_rec # + loss_cosine + + def validation_step(self, batch, batch_idx): + Y = self.forward(batch, return_w=True) + x_char_target = batch['x_word_targets'][:, 1:] + x_char_pred = Y['x_char_pred'] + + loss_rec = self._loss_function(x_char_pred.reshape(-1, x_char_pred.shape[2]), x_char_target.reshape(-1)) + + # y_lexical = Y['lexical'] + # y_context = Y['context'] + # sl = batch['x_sent_len'].detach().cpu().numpy() + # word_repr = [] + # # sent_repr = y_sent + # for ii in range(sl.shape[0]): + # for jj in range(sl[ii]): + # if True: # random.random() < 0.15: + # word_repr.append(y_lexical[ii, jj].unsqueeze(0)) + # word_repr.append(y_context[ii, jj].unsqueeze(0)) + # + # word_repr = torch.cat(word_repr, dim=0) + # word_repr = word_repr.reshape(-1, 2, word_repr.shape[1]) + # loss_cosine = 
self._cosine_loss(word_repr) + # return {'total_loss': loss_rec + loss_cosine} + return {'total_loss': loss_rec} + + def validation_epoch_end(self, outputs: List[Any]) -> None: + + loss = 0 + for output in outputs: + loss += output['total_loss'] + loss /= len(outputs) + + res = {'val_loss': loss} + self._epoch_results = self._compute_early_stop(res) + self.log('val/early_meta', self._early_stop_meta_val) + self.log('val/loss', loss) + + def configure_optimizers(self): + return torch.optim.AdamW(self.parameters()) + + def _compute_early_stop(self, res): + if res["val_loss"] < self._res['b_loss']: + self._early_stop_meta_val += 1 + self._res['b_loss'] = res["val_loss"] + res['best_loss'] = True + return res + + def _get_device(self): + if self._start_stop.weight.device.type == 'cpu': + return 'cpu' + return '{0}:{1}'.format(self._start_stop.weight.device.type, str(self._start_stop.weight.device.index)) + + def load(self, filename: str): + self.load_state_dict(torch.load(filename, map_location='cpu')) diff --git a/Languasito/languasito/modules.py b/Languasito/languasito/modules.py new file mode 100644 index 000000000..7d356b3b0 --- /dev/null +++ b/Languasito/languasito/modules.py @@ -0,0 +1,358 @@ +import torch +import torch.nn as nn +import pytorch_lightning as pl +import numpy as np +from languasito.utils import mask_concat + + +class LinearNorm(pl.LightningModule): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_normal_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + + +class ConvNorm(pl.LightningModule): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert (kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_normal_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class WordGram(pl.LightningModule): + def __init__(self, num_chars: int, num_langs: int, num_filters=512, char_emb_size=256, case_emb_size=32, + lang_emb_size=32, num_layers=3): + super(WordGram, self).__init__() + NUM_FILTERS = num_filters + self._num_filters = NUM_FILTERS + self._lang_emb = nn.Embedding(num_langs + 1, lang_emb_size) + self._tok_emb = nn.Embedding(num_chars + 1, char_emb_size) + self._case_emb = nn.Embedding(4, case_emb_size) + self._num_layers = num_layers + convolutions_char = [] + cs_inp = char_emb_size + lang_emb_size + case_emb_size + for _ in range(num_layers): + conv_layer = nn.Sequential( + ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + convolutions_char.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + lang_emb_size + self._convolutions_char = nn.ModuleList(convolutions_char) + self._pre_out = LinearNorm(NUM_FILTERS // 2, NUM_FILTERS // 2) + + def forward(self, x_char, x_case, x_lang, x_mask, x_word_len): + x_char = self._tok_emb(x_char) + x_case = self._case_emb(x_case) + x_lang = self._lang_emb(x_lang) + + x = torch.cat([x_char, 
x_case], dim=-1)
+        x = x.permute(0, 2, 1)
+        x_lang = x_lang.unsqueeze(1).repeat(1, x_case.shape[1], 1).permute(0, 2, 1)
+        half = self._num_filters // 2
+        count = 0
+        res = None
+        skip = None
+        for conv in self._convolutions_char:
+            count += 1
+            drop = self.training
+            if count >= len(self._convolutions_char):
+                drop = False
+            if skip is not None:
+                x = x + skip
+
+            x = torch.cat([x, x_lang], dim=1)
+            conv_out = conv(x)
+            # gated activation: the first half of the filters is tanh-activated, the second half gates it
+            tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :]))
+            if res is None:
+                res = tmp
+            else:
+                res = res + tmp
+            skip = tmp
+            x = torch.dropout(tmp, 0.1, drop)
+        x = x + res
+        x = x.permute(0, 2, 1)
+        x = x * x_mask.unsqueeze(2)
+        pre = torch.sum(x, dim=1, dtype=torch.float)
+        norm = pre / x_word_len.unsqueeze(1)
+        # embeds = self._pre_out(norm)
+        # norm = embeds.norm(p=2, dim=-1, keepdim=True)
+        # embeds_normalized = embeds.div(norm)
+        # return embeds_normalized
+
+        return torch.tanh(self._pre_out(norm))
+
+    def _get_device(self):
+        if self._lang_emb.weight.device.type == 'cpu':
+            return 'cpu'
+        return '{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index))
+
+    def save(self, path):
+        torch.save(self.state_dict(), path)
+
+    def load(self, path):
+        self.load_state_dict(torch.load(path, map_location='cpu'))
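+
+# Usage sketch (illustrative, not part of the module): WordGram pools character-level
+# convolutions into one vector per word. Given char ids and case ids of shape
+# (num_words, max_word_len + 2), one language id per word, a float mask of the same
+# shape as the char ids and per-word lengths, it returns one vector of size
+# num_filters // 2 per word (256 with the default num_filters=512):
+#
+#   wg = WordGram(num_chars=100, num_langs=1)
+#   emb = wg(x_char, x_case, x_lang, x_mask, x_word_len)  # -> (num_words, 256)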
+
+
+class GE2ELoss(nn.Module):
+
+    def __init__(self, init_w=10.0, init_b=-5.0, loss_method='softmax'):
+        '''
+        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector (e.g. d-vector)
+        Args:
+            - init_w (float): defines the initial value of w in Equation (5) of [1]
+            - init_b (float): defines the initial value of b in Equation (5) of [1]
+        '''
+        super(GE2ELoss, self).__init__()
+        self.w = nn.Parameter(torch.tensor(init_w))
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.loss_method = loss_method
+
+        assert self.loss_method in ['softmax', 'contrast']
+
+        if self.loss_method == 'softmax':
+            self.embed_loss = self.embed_loss_softmax
+        if self.loss_method == 'contrast':
+            self.embed_loss = self.embed_loss_contrast
+
+    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
+        '''
+        Calculates the new centroids excluding the reference utterance
+        '''
+        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1:]))
+        excl = torch.mean(excl, 0)
+        new_centroids = []
+        for i, centroid in enumerate(centroids):
+            if i == spkr:
+                new_centroids.append(excl)
+            else:
+                new_centroids.append(centroid)
+        return torch.stack(new_centroids)
+
+    def calc_cosine_sim(self, dvecs, centroids):
+        '''
+        Make the cosine similarity matrix with dims (N,M,N)
+        '''
+        cos_sim_matrix = []
+        for spkr_idx, speaker in enumerate(dvecs):
+            cs_row = []
+            for utt_idx, utterance in enumerate(speaker):
+                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
+                # vector based cosine similarity for speed
+                cs_row.append(torch.clamp(
+                    torch.mm(utterance.unsqueeze(1).transpose(0, 1), new_centroids.transpose(0, 1)) / (
+                            torch.norm(utterance) * torch.norm(new_centroids, dim=1)), 1e-6))
+            cs_row = torch.cat(cs_row, dim=0)
+            cos_sim_matrix.append(cs_row)
+        return torch.stack(cos_sim_matrix)
+
+    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
+        '''
+        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
+        '''
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                L_row.append(-torch.nn.functional.log_softmax(cos_sim_matrix[j, i], 0)[j])
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+
+    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
+        '''
+        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
+        '''
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
+                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1:]))
+                L_row.append(1. - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+
+    def forward(self, dvecs):
+        '''
+        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        '''
+        # Calculate centroids
+        centroids = torch.mean(dvecs, 1)
+
+        # Calculate the cosine similarity matrix, scaling with w clamped to stay positive
+        # (the original clamped w on a separate line and discarded the result)
+        cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
+        cos_sim_matrix = cos_sim_matrix * torch.clamp(self.w, min=1e-6) + self.b
+        L = self.embed_loss(dvecs, cos_sim_matrix)
+        return L.mean()
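+
+# Usage sketch (illustrative): GE2ELoss consumes an (N, M, D) tensor as described in
+# the docstring above, e.g. 4 speakers with 5 utterances each, embedded in 64 dims:
+#
+#   criterion = GE2ELoss(init_w=10.0, init_b=-5.0, loss_method='softmax')
+#   loss = criterion(torch.randn(4, 5, 64))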
+
+
+class SkipEncoder(nn.Module):
+    # helper encoder (not referenced elsewhere in this diff); the truncated loop body
+    # and the missing output projection are completed under the assumption that each
+    # layer consumes the previous layer's output
+    def __init__(self, input_size, hidden_size, num_layers):
+        super().__init__()
+        fw_list = []
+        bw_list = []
+        self._num_layers = num_layers
+        inp_size = input_size
+        for ii in range(num_layers):
+            fw_list.append(nn.LSTM(inp_size, hidden_size, num_layers=1, batch_first=True, bidirectional=False))
+            bw_list.append(nn.LSTM(inp_size, hidden_size, num_layers=1, batch_first=True, bidirectional=False))
+            inp_size = hidden_size
+
+        self._rnn_fw = nn.ModuleList(fw_list)
+        self._rnn_bw = nn.ModuleList(bw_list)
+        self._linear_out = LinearNorm(hidden_size * 2, hidden_size)  # assumed projection (missing in the original)
+
+    def forward(self, x):
+        hidden_fw = x
+        hidden_bw = x
+        for ii in range(self._num_layers):
+            out_fw, _ = self._rnn_fw[ii](hidden_fw)
+            out_bw, _ = self._rnn_bw[ii](torch.flip(hidden_bw, [1]))
+            out_bw = torch.flip(out_bw, [1])
+            hidden_fw = out_fw
+            hidden_bw = out_bw
+        context = torch.cat([out_fw[:, :-2, :], out_bw[:, 2:, :]], dim=-1)
+        return torch.tanh(self._linear_out(context))
+
+
+class WordDecoder(nn.Module):
+    def __init__(self, cond_size: int, char_emb_size: int, vocab_size: int, rnn_size: int = 200, rnn_layers: int = 2):
+        super().__init__()
+        self._char_emb_size = char_emb_size
+        self._vocab_size = vocab_size
+        self._cond_size = cond_size
+
+        self._char_emb = nn.Embedding(vocab_size, char_emb_size)
+        self._rnn = nn.LSTM(cond_size + char_emb_size, rnn_size, num_layers=rnn_layers, batch_first=True)
+        self._output = nn.Linear(rnn_size, vocab_size)
+
+    def forward(self, cond, gs_chars=None):
+        if gs_chars is not None:
+            # teacher forcing: decode conditioned on the gold character sequence
+            cond = cond.unsqueeze(1).repeat(1, gs_chars.shape[1], 1)
+            gs_chars = self._char_emb(gs_chars)
+            x_input = torch.cat([cond, gs_chars], dim=-1)
+            x_out_rnn, _ = self._rnn(x_input)
+            return self._output(x_out_rnn)[:, :-1, :]
+        else:
+            # greedy decoding, character by character
+            reached_end = [False for ii in range(cond.shape[0])]
+            last_char = np.ones((cond.shape[0], 1)) * 2  # id 2 is the start-of-word marker
+            last_char = torch.tensor(last_char, dtype=torch.long, device=self._get_device())
+            last_char = self._char_emb(last_char)
+            cond = cond.unsqueeze(1)
+            index = 0
+            decoder_hidden = None
+            output_list = []
+            while True:
+                decoder_input = torch.cat([cond, last_char], dim=-1)
+                decoder_output, decoder_hidden = self._rnn(decoder_input, hx=decoder_hidden)
+                output = self._output(decoder_output)
+                last_char = torch.argmax(output, dim=-1)
+                output = last_char.detach().cpu().numpy()
+                for ii in range(output.shape[0]):
+                    if output[ii] == 3:  # id 3 is the end-of-word marker
+                        reached_end[ii] = True
+
+                output_list.append(last_char.detach().cpu())
+                last_char = self._char_emb(last_char)
+
+                index += 1
+                if np.all(reached_end):
+                    break
+            output = torch.cat(output_list, dim=1).detach().cpu().numpy()
+            return output
+
+    def _get_device(self):
+        if self._char_emb.weight.device.type == 'cpu':
+            return 'cpu'
+        return '{0}:{1}'.format(self._char_emb.weight.device.type, str(self._char_emb.weight.device.index))
+
+
+def log1pexp(x):
+    # numerically stable log(1 + exp(x)): above the threshold, exp(x) would overflow
+    # and log(1 + e^x) is ~= x anyway
+    return torch.where(x < 50, torch.log1p(torch.exp(x)), x)
+
+
+import random
+
+
+class CosineLoss(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._cs_loss = torch.nn.CosineEmbeddingLoss()
+
+    def _pos_list(self, x):
+        return x[:, 0, :], x[:, 1, :]
+
+    def _neg_list(self, x):
+        indices_pos = []
+        indices_neg = []
+        x = x.reshape(-1, x.shape[2])
+        for ii in range(x.shape[0] // 2 - 1):
+            # for jj in range(ii + 1, x.shape[0] // 2):
+            indices_pos.append(ii * 2)
+            indices_pos.append(ii * 2 + 1)
+            jj = random.randint(0, x.shape[0] // 2 - 1)
+            while jj == ii:
+                jj = random.randint(0, x.shape[0] // 2 - 1)
+            indices_neg.append(jj * 2)
+            indices_neg.append(jj * 2 + 1)
+
+        indices_pos = torch.tensor(indices_pos, dtype=torch.long, device=self._get_device(x))
+        indices_neg = torch.tensor(indices_neg, dtype=torch.long, device=self._get_device(x))
+        return x[indices_pos], x[indices_neg]
+
+    def _get_device(self, x):
+        if x.device.type == 'cpu':
+            return 'cpu'
+        return '{0}:{1}'.format(x.device.type, str(x.device.index))
+
+    def forward(self, x):
+        pos1, pos2 = self._pos_list(x)
+        target = torch.ones(pos1.shape[0], device=self._get_device(x))
+        loss_pos = self._cs_loss(pos1, pos2, target)
+        # tmp = pos1 * pos2
+        # tmp = torch.mean(tmp, dim=1)
+        # tmp = log1pexp(-tmp)  # torch.log(1 + torch.exp(-tmp))
+        # loss_pos = tmp.mean()
+
+        pos, neg = self._neg_list(x)
+        target = -torch.ones(pos.shape[0], device=self._get_device(x))
+        loss_neg = self._cs_loss(pos, neg, target)
+        # tmp = pos * neg
+        # tmp = torch.mean(tmp, dim=1)
+        # tmp = log1pexp(tmp)  # torch.log(1 + torch.exp(-tmp))
+        # loss_neg = tmp.mean()
+
+        return loss_pos + loss_neg
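+
+# Usage sketch (illustrative): CosineLoss expects pairs stacked on dim 1, i.e. a
+# (num_pairs, 2, dim) tensor where x[:, 0] and x[:, 1] are two views of the same
+# word; negatives are sampled at random from the other pairs:
+#
+#   criterion = CosineLoss()
+#   loss = criterion(torch.randn(8, 2, 64))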
diff --git a/Languasito/languasito/train_lm.py b/Languasito/languasito/train_lm.py
new file mode 100644
index 000000000..41f2652a5
--- /dev/null
+++ b/Languasito/languasito/train_lm.py
@@ -0,0 +1,100 @@
+import sys
+import optparse
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks import EarlyStopping
+from torch.utils.data import DataLoader
+import torch
+
+sys.path.append('')
+
+from languasito.utils import LanguasitoDataset, load_dataset, LanguasitoCollate, Encodings
+from languasito.model import Languasito
+
+
+class PrintAndSaveCallback(pl.callbacks.Callback):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+
+    def on_validation_end(self, trainer, pl_module):
+        res = pl_module._epoch_results
+        if 'best_loss' in res:
+            trainer.save_checkpoint('{0}.best'.format(self.args.output_base))
+
+        trainer.save_checkpoint('{0}.last'.format(self.args.output_base))
+
+        msg = '\n\n\tVal loss: \t{0:.4f}'.format(res['val_loss'])
+        print(msg)
+        print("\n")
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option('--train', action='store', dest='train_file', default="corpus/ro-train")
+    parser.add_option('--dev', action='store', dest='dev_file', default="corpus/ro-dev")
+    parser.add_option('--store', action='store', dest='output_base', default="data/laro")
+    parser.add_option('--resume', action='store_true', dest='resume')
+    parser.add_option('--patience', action='store', default=20, type='int', dest='patience', help='Default=20')
+    parser.add_option('--gpus', action='store', default=1, type='int', dest='gpus', help='Default=1')
+    parser.add_option('--batch-size', action='store', default=128, type='int', dest='batch_size', help='Default=128')
+    parser.add_option('--num-workers', action='store', default=4, type='int', dest='num_workers', help='Default=4')
+
+    (params, _) = parser.parse_args(sys.argv)
+
+    #train = load_dataset(params.train_file)
+    #dev = load_dataset(params.dev_file)
+    train = LanguasitoDataset()
+    train.load_file(params.train_file)
+    dev = LanguasitoDataset()
+    dev.load_file(params.dev_file)
+
+    enc = Encodings()
+    enc.update(train)
+    enc.save('{0}.encodings'.format(params.output_base), full=False)
+
+    collate = LanguasitoCollate(enc)
+    model = Languasito(enc)
+
+    train_loader = DataLoader(train, batch_size=params.batch_size, collate_fn=collate.collate_fn, shuffle=True,
+                              num_workers=params.num_workers, pin_memory=True)
+    val_loader = DataLoader(dev, batch_size=params.batch_size, collate_fn=collate.collate_fn,
+                            num_workers=params.num_workers, pin_memory=True)
+
+    early_stopping_callback = EarlyStopping(
+        monitor='val/early_meta',
+        patience=params.patience,
+        verbose=True,
+        mode='max'
+    )
+
+    if params.gpus == 0:
+        acc = 'ddp_cpu'
+        plugins = None
+    else:
+        acc = 'ddp'
+        plugins = 'ddp_sharded'
+
+    # if params.resume:
+    #     chk = torch.load('{0}.last'.format(params.output_base))
+    # else:
+    #     chk = None
+
+    if params.resume:
+        checkpoint_path = '{0}.last'.format(params.output_base)
+    else:
+        checkpoint_path = None
+
+    trainer = pl.Trainer(
+        gpus=params.gpus,
+        accelerator=acc,
+        plugins=plugins,
+        num_nodes=1,
+        default_root_dir='data/',
+        callbacks=[early_stopping_callback, PrintAndSaveCallback(params)],
+        val_check_interval=min(10000, len(train) // params.batch_size),
+        resume_from_checkpoint=checkpoint_path,
+        # limit_train_batches=5,
+        # limit_val_batches=2
+    )
+
+    trainer.fit(model, train_loader, val_loader)
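+
+# Example invocation (illustrative; the paths are the option defaults above, run
+# from the Languasito/ directory so that the `languasito` package is importable):
+#
+#   python3 languasito/train_lm.py --train corpus/ro-train --dev corpus/ro-dev \
+#       --store data/laro --batch-size 128 --gpus 1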
diff --git a/Languasito/languasito/train_wg.py b/Languasito/languasito/train_wg.py
new file mode 100644
index 000000000..41f2652a5
--- /dev/null
+++ b/Languasito/languasito/train_wg.py
@@ -0,0 +1,100 @@
+import sys
+import optparse
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks import EarlyStopping
+from torch.utils.data import DataLoader
+import torch
+
+sys.path.append('')
+
+from languasito.utils import LanguasitoDataset, load_dataset, LanguasitoCollate, Encodings
+from languasito.model import Languasito
+
+
+class PrintAndSaveCallback(pl.callbacks.Callback):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+
+    def on_validation_end(self, trainer, pl_module):
+        res = pl_module._epoch_results
+        if 'best_loss' in res:
+            trainer.save_checkpoint('{0}.best'.format(self.args.output_base))
+
+        trainer.save_checkpoint('{0}.last'.format(self.args.output_base))
+
+        msg = '\n\n\tVal loss: \t{0:.4f}'.format(res['val_loss'])
+        print(msg)
+        print("\n")
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option('--train', action='store', dest='train_file', default="corpus/ro-train")
+    parser.add_option('--dev', action='store', dest='dev_file', default="corpus/ro-dev")
+    parser.add_option('--store', action='store', dest='output_base', default="data/laro")
+    parser.add_option('--resume', action='store_true', dest='resume')
+    parser.add_option('--patience', action='store', default=20, type='int', dest='patience', help='Default=20')
+    parser.add_option('--gpus', action='store', default=1, type='int', dest='gpus', help='Default=1')
+    parser.add_option('--batch-size', action='store', default=128, type='int', dest='batch_size', help='Default=128')
+    parser.add_option('--num-workers', action='store', default=4, type='int', dest='num_workers', help='Default=4')
+
+    (params, _) = parser.parse_args(sys.argv)
+
+    #train = load_dataset(params.train_file)
+    #dev = load_dataset(params.dev_file)
+    train = LanguasitoDataset()
+    train.load_file(params.train_file)
+    dev = LanguasitoDataset()
+    dev.load_file(params.dev_file)
+
+    enc = Encodings()
+    enc.update(train)
+    enc.save('{0}.encodings'.format(params.output_base), full=False)
+
+    collate = LanguasitoCollate(enc)
+    model = Languasito(enc)
+
+    train_loader = DataLoader(train, batch_size=params.batch_size, collate_fn=collate.collate_fn, shuffle=True,
+                              num_workers=params.num_workers, pin_memory=True)
+    val_loader = DataLoader(dev, batch_size=params.batch_size, collate_fn=collate.collate_fn,
+                            num_workers=params.num_workers, pin_memory=True)
+
+    early_stopping_callback = EarlyStopping(
+        monitor='val/early_meta',
+        patience=params.patience,
+        verbose=True,
+        mode='max'
+    )
+
+    if params.gpus == 0:
+        acc = 'ddp_cpu'
+        plugins = None
+    else:
+        acc = 'ddp'
+        plugins = 'ddp_sharded'
+
+    # if params.resume:
+    #     chk = torch.load('{0}.last'.format(params.output_base))
+    # else:
+    #     chk = None
+
+    if params.resume:
+        checkpoint_path = '{0}.last'.format(params.output_base)
+    else:
+        checkpoint_path = None
+
+    trainer = pl.Trainer(
+        gpus=params.gpus,
+        accelerator=acc,
+        plugins=plugins,
+        num_nodes=1,
+        default_root_dir='data/',
+        callbacks=[early_stopping_callback, PrintAndSaveCallback(params)],
+        val_check_interval=min(10000, len(train) // params.batch_size),
+        resume_from_checkpoint=checkpoint_path,
+        # limit_train_batches=5,
+        # limit_val_batches=2
+    )
+
+    trainer.fit(model, train_loader, val_loader)
diff --git a/Languasito/languasito/utils.py b/Languasito/languasito/utils.py
new file mode 100644
index 000000000..b8e65dfa3
--- /dev/null
+++ b/Languasito/languasito/utils.py
@@ -0,0 +1,450 @@
+import re
+from torch.utils.data import Dataset
+import json, os
+from tqdm.autonotebook import tqdm as tqdm
+import torch
+import numpy as np
+import random
+import pickle
+
+
+class LanguasitoTokenizer:
+    def __init__(self, no_space_language=False):
+        self._no_space_language = no_space_language
+
+    def __call__(self, text):
+        if self._no_space_language:
+            return [ch for ch in text]
+        else:
+            toks = []
+            tok = ''
+            for ch in text:
+                if not ch.isalnum() or ch == ' ':
+                    tok = tok.strip()
+                    if len(tok) != 0:
+                        toks.append(tok)
+                    tok = ''
+                    if ch != ' ':
+                        toks.append(ch)
+                else:
+                    tok += ch
+            if tok.strip() != '':
+                toks.append(tok)
+
+            return toks
+    # def __call__(self, text):
+    #     if self._no_space_language:
+    #         return [ch for ch in text]
+    #     else:
+    #         punctuation = '''"’'()[]{}<>:,‒–—―…!.«»-?‘’“”;/⁄␠·&@*\\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※"„”'''
+    #         new_text = ''
+    #         for ch in text:
+    #             if re.match(u'[\u4e00-\u9fff]', ch):
+    #                 new_text += ' ' + ch + ' '
+    #             elif ch in punctuation:
+    #                 new_text += ' ' + ch + ' '
+    #             else:
+    #                 new_text += ch
+    #
+    #         tmp = new_text.replace('  ', ' ')
+    #         while tmp != new_text:
+    #             new_text = tmp
+    #             tmp = new_text.replace('  ', ' ')
+    #         new_text = new_text.strip()
+    #         return new_text.split(' ')
+
+
+def mp_job(data):
+    no_space_lang, lines = data
+    print(f"\t\ttokenizing {len(lines)} lines ...")
+    _st = LanguasitoTokenizer(no_space_language=no_space_lang)
+    filtered_lines = []
+    # new_lines = []
+    for line 
in lines: + toks = _st(line) + if len(toks) > 5 and len(toks) < 50: + valid = True + for tok in toks: + if len(tok) > 20: + valid = False + break + if valid: + filtered_lines.append(toks) + # new_lines.append(line) + + return filtered_lines + + +class LanguasitoDataset(Dataset): + def __init__(self, no_space_lang=False): + self._examples = [] + self._st = LanguasitoTokenizer(no_space_language=no_space_lang) + self.no_space_lang = no_space_lang + + def load_file(self, filename: str): + print(f"Loading {filename}") + + if os.path.exists(filename + ".pickle"): + print("\tloading from cached file ...") + self._examples = pickle.load(open(filename + ".pickle", "rb")) + print(f"\tdataset has {len(self._examples)} lines.") + return + + import multiprocessing + lines = [] + chunks = [] + with open(filename, "r", encoding="utf8") as f: + for line in f: + l = line.strip() + if l == "": + continue + lines.append(l) + if len(lines) > 999999: # 1M + chunks.append(lines) + lines = [] + print(f"\treading chunk #{len(chunks)} ...") + if len(chunks) > 100: # 200 M lines + break + if len(lines) > 0: + chunks.append(lines) + + cpu_count = int(multiprocessing.cpu_count() / 2) + print(f"\tloaded {len(chunks)} chunks, now filtering on {cpu_count} threads ...") + + packed_chunks = [(self.no_space_lang, lines) for lines in chunks] + + p = multiprocessing.Pool(processes=cpu_count) + return_data = p.map(mp_job, packed_chunks) + p.close() + p.join() + + cnt = 0 + for lines in return_data: + for line in lines: + self._examples.append([line, cnt]) + cnt += 1 + + """ + filtered_lines, o_lines = self._filter(lines) + n = len(filtered_lines) + for ii in range(n): + tokenized = filtered_lines[ii] + if o_lines[ii].startswith(" 5 and len(toks) < 50: + valid = True + for tok in toks: + if len(tok) > 20: + valid = False + break + if valid: + filtered_lines.append(toks) + new_lines.append(line) + return filtered_lines, new_lines + + +class LangusitoWordDecomposer: + def __init__(self): + self._tok2int = {} + + def train(self, dataset: LanguasitoDataset, w_cutoff=7, max_vocab_size=1000, max_ngram=-1): + word2count = {} + n = len(dataset) + for ii in tqdm(range(n), ncols=100, desc='Updating encodings'): + tokenized = dataset[ii]['sent1'] + for tok in tokenized: + if tok in word2count: + word2count[tok] += 1 + else: + word2count[tok] = 1 + + word_list = [] + for word in word2count: + if word2count[word] > w_cutoff: + word_list.append(word) + for c in word: + if c not in self._tok2int: + self._tok2int[c] = len(self._tok2int) + + ngram2count = {} + for word in word_list: + # get all ngrams + ngrams = self._extract_ngrams(word, max_ngram) + for ngram in ngrams: + if ngram in ngram2count: + ngram2count[ngram] += 1 + else: + ngram2count[ngram] = 1 + + order2count = {} + for ngram in ngram2count: + key = len(ngram) + if key not in order2count: + order2count[key] = ngram2count[ngram] + else: + order2count[key] += ngram2count[ngram] + + sorted_ngrams = [k for k, v in sorted(ngram2count.items(), reverse=True, + key=lambda item: item[1])] + for ngram in sorted_ngrams[:min(len(sorted_ngrams), max_vocab_size - len(self._tok2int))]: + self._tok2int[ngram] = len(self._tok2int) + + @staticmethod + def _extract_ngrams(word, max_len): + if max_len == -1: + max_len = len(word) + max_size = min(max_len, len(word)) + + ngrams = [] + for ii in range(1, max_size): + for jj in range(len(word) - ii): + ngrams.append(word[jj:jj + ii + 1]) + return ngrams + + @staticmethod + def _find_shortest_path(graph, start, end, path=[]): + path = path + [start] + if 
start == end: + return path + + shortest = None + for node in graph[start]: + if node not in path: + newpath = LangusitoWordDecomposer._find_shortest_path(graph, node, end, path) + if newpath: + if not shortest or len(newpath) < len(shortest): + shortest = newpath + return shortest + + def _build_graph(self, word): + graph = {} + for ii in range(len(word) - 1): + start_node = ii + graph[start_node] = [start_node + 1] + for jj in range(ii + 2, len(word) + 1): + end_node = jj + tok = word[ii:jj] + if tok in self._tok2int: + graph[start_node].append(end_node) + graph[len(word) - 1] = [len(word)] + graph[len(word)] = [] + return graph + + def tokenize(self, word_list: list): + tokenized = [] + for word in word_list: + graph = self._build_graph(word) + spath = LangusitoWordDecomposer._find_shortest_path(graph, 0, len(word)) + toks = [] + for ii in range(1, len(spath)): + toks.append(word[spath[ii - 1]:spath[ii]]) + tokenized.append(toks) + return tokenized + + +def load_dataset(filename: str) -> LanguasitoDataset: + dataset = LanguasitoDataset() + print(f"Reading dataset file {filename} ...") + lines = open(filename, "r", encoding="utf8").readlines() + for ii in tqdm(range(len(lines)), desc='Loading dataset "{0}"'.format(filename), ncols=100): + fname = lines[ii].strip() + dataset.load_file(fname) + return dataset + + +def mask_concat(representations, drop_prob: float, training: bool, device: str): + if training: + masks = [] + for ii in range(len(representations)): + mask = np.ones((representations[ii].shape[0], representations[ii].shape[1]), dtype=np.long) + masks.append(mask) + + for ii in range(masks[0].shape[0]): + for jj in range(masks[0].shape[1]): + mult = 1 + for kk in range(len(masks)): + p = random.random() + if p < drop_prob: + mult += 1 + masks[kk][ii, jj] = 0 + for kk in range(len(masks)): + masks[kk][ii, jj] *= mult + for kk in range(len(masks)): + masks[kk] = torch.tensor(masks[kk], device=device) + + for kk in range(len(masks)): + representations[kk] = representations[kk] * masks[kk].unsqueeze(2) + + return torch.cat(representations, dim=-1) + + +class Encodings: + def __init__(self, max_vocab_size: int = 10000, min_word_occ: int = 5, min_char_occ: int = 20): + self._max_vocab_size = max_vocab_size + self._min_word_occ = min_word_occ + self._min_char_occ = min_char_occ + self.char2int = {'': 0, '': 1, '': 2, '': 3} + self.word2int = {} + self.char_list = [] + self.word_decomposer = LangusitoWordDecomposer() + + def load(self, filename: str): + json_obj = json.load(open(filename)) + self.char2int = json_obj['char2int'] + if 'word2int' in json_obj: + self.word2int = json_obj['word2int'] + + self.char_list = [' ' for _ in range(len(self.char2int))] + for char in self.char2int: + self.char_list[self.char2int[char]] = char + + self.word_decomposer._tok2int = json_obj['tok2int'] + + def save(self, filename: str, full: bool = True): + json_obj = {'char2int': self.char2int} + json_obj['tok2int'] = self.word_decomposer._tok2int + if full: + json_obj['word2int'] = self.word2int + json.dump(json_obj, open(filename, 'w')) + + def update(self, dataset: LanguasitoDataset): + self.word_decomposer.train(dataset) + word2count = {} + char2count = {} + n = len(dataset) + for ii in tqdm(range(n), ncols=100, desc='Updating encodings'): + tokenized = dataset[ii]['sent1'] + for tok in tokenized: + tok_lower = tok.lower() + if tok_lower not in word2count: + word2count[tok_lower] = 1 + else: + word2count[tok_lower] += 1 + + for ch in tok_lower: + if ch not in char2count: + char2count[ch] = 1 + else: + 
char2count[ch] += 1 + + # sort dict + sorted_words = [k for k, v in sorted(word2count.items(), key=lambda item: item[1], reverse=True)] + sorted_words = sorted_words[:min(len(sorted_words), self._max_vocab_size)] + for w in sorted_words: + if word2count[w] > self._min_word_occ: + self.word2int[w] = len(self.word2int) + + for ch in char2count: + if char2count[ch] > self._min_char_occ: + self.char2int[ch] = len(self.char2int) + + self.char_list = [' ' for _ in range(len(self.char2int))] + for char in self.char2int: + self.char_list[self.char2int[char]] = char + + +class LanguasitoCollate: + def __init__(self, encodings: Encodings, live: bool = False): + self._encodings = encodings + self._live = live + + def collate_fn(self, batch): + if not self._live: + new_batch = [] + for b in batch: + new_batch.append(b['sent1']) + new_batch.append(b['sent2']) + batch = new_batch + a_sent_len = [len(sent) for sent in batch] + a_word_len = [] + word_list = [] + for sent in batch: + for word in sent: + a_word_len.append(len(word)) + word_list.append(word) + x_sent_len = np.array(a_sent_len, dtype=np.long) + x_word_len = np.array(a_word_len, dtype=np.long) + max_sent_len = np.max(x_sent_len) + max_word_len = np.max(x_word_len) + x_sent_masks = np.zeros((len(batch), max_sent_len), dtype=np.float) + x_word_masks = np.zeros((x_word_len.shape[0], max_word_len + 2), dtype=np.float) + + x_word_char = np.zeros((x_word_len.shape[0], max_word_len + 2), dtype=np.long) + x_word_case = np.zeros((x_word_len.shape[0], max_word_len + 2), dtype=np.long) + word_targets = self._encodings.word_decomposer.tokenize(word_list) + max_word_dec = max([len(w) for w in word_targets]) + x_word_decoder = np.zeros((x_word_len.shape[0], max_word_dec + 2), dtype=np.long) + c_word = 0 + x_lang_sent = np.zeros((len(batch)), dtype=np.long) + x_lang_word = [] + + for iSent in range(len(batch)): + sent = batch[iSent] + x_lang_sent[iSent] = 1 + for iWord in range(len(sent)): + x_word_char[iWord, 0] = 2 # start of token + word = sent[iWord] + x_sent_masks[iSent, iWord] = 1 + x_lang_word.append(1) + target = word_targets[c_word] + x_word_decoder[c_word, 0] = 2 + for iTarget in range(len(target)): + tgt = target[iTarget] + if tgt in self._encodings.word_decomposer._tok2int: + x_word_decoder[c_word, iTarget + 1] = self._encodings.word_decomposer._tok2int[tgt] + 4 + x_word_decoder[c_word, len(target) + 1] = 3 + + for iChar in range(len(word)): + x_word_masks[c_word, iChar] = 1 + ch = word[iChar] + if ch.lower() == ch.upper(): # symbol + x_word_case[c_word, iChar] = 1 + elif ch.lower() != ch: # upper + x_word_case[c_word, iChar] = 2 + else: # lower + x_word_case[c_word, iChar] = 3 + ch = ch.lower() + if ch in self._encodings.char2int: + x_word_char[c_word, iChar + 1] = self._encodings.char2int[ch] + else: + x_word_char[c_word, iChar + 1] = self._encodings.char2int[''] + x_word_char[c_word, len(word) + 1] = 3 # end of token + x_word_masks[c_word, len(word) + 1] = 1 + c_word += 1 + + x_lang_word = np.array(x_lang_word) + response = { + 'x_word_char': torch.tensor(x_word_char), + 'x_word_case': torch.tensor(x_word_case), + 'x_lang_word': torch.tensor(x_lang_word), + 'x_sent_len': torch.tensor(x_sent_len), + 'x_word_len': torch.tensor(x_word_len), + 'x_sent_masks': torch.tensor(x_sent_masks), + 'x_word_masks': torch.tensor(x_word_masks), + 'x_word_targets': torch.tensor(x_word_decoder), + 'x_max_len': max_sent_len + } + + return response diff --git a/Languasito/scripts/import_wiki.py b/Languasito/scripts/import_wiki.py new file mode 100644 index 
000000000..cf4847ddf
--- /dev/null
+++ b/Languasito/scripts/import_wiki.py
@@ -0,0 +1,44 @@
+import sys
+import optparse
+import os
+import tqdm
+
+
+def _get_all_files(base_path):
+    all_files = []
+    for path, subdirs, files in os.walk(base_path):
+        for name in files:
+            fname = os.path.join(path, name)
+            if not fname.endswith('.'):
+                all_files.append(fname)
+    return all_files
+
+
+def _process(params):
+    all_files = _get_all_files(params.wiki_base)
+    f_dev = open(params.dev_file, 'w')
+    f_train = open(params.train_file, 'w')
+    for ii in tqdm.tqdm(range(len(all_files))):
+        if (ii + 1) % params.ratio == 0:
+            f = f_dev
+        else:
+            f = f_train
+        f.write(all_files[ii] + '\n')
+    f_train.close()
+    f_dev.close()
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option('--wiki', action='store', dest='wiki_base')
+    parser.add_option('--train', action='store', dest='train_file')
+    parser.add_option('--dev', action='store', dest='dev_file')
+    parser.add_option('--ratio', action='store', default=100, type='int', dest='ratio',
+                      help='train/dev ratio (default=100)')
+
+    (params, _) = parser.parse_args(sys.argv)
+
+    if params.wiki_base and params.train_file and params.dev_file:
+        _process(params)
+    else:
+        parser.print_help()
diff --git a/Languasito/test.py b/Languasito/test.py
new file mode 100644
index 000000000..243c9ef32
--- /dev/null
+++ b/Languasito/test.py
@@ -0,0 +1,42 @@
+import sys
+
+sys.path.append('')
+import torch
+
+from languasito.utils import Encodings, LanguasitoCollate
+from languasito.model import Languasito
+
+enc = Encodings()  # filename='data/tokenizer-ro-fasttext')
+
+enc.load('data/laro.encodings')
+checkpoint = torch.load('data/laro.best', map_location='cpu')
+model = Languasito(encodings=enc)
+model.load_state_dict(checkpoint['state_dict'])
+model.eval()
+collate = LanguasitoCollate(enc, live=True)
+text = ['Am citit despre pancreas și sucul pancreatic .'.split(' '), 'Pancreasul secretă suc pancreatic .'.split(' '),
+        'Ana are mere dar nu are pere'.split(' '),
+        'Steagul României , de asemenea cunoscut ca drapelul , are culorile albastru , galben și roșu .'.split(' ')]
+
+batch = collate.collate_fn(text)
+y = model(batch, return_w=True, imagine=True)
+
+
+def _get_word(w_emb):
+    word = ''
+    for ii in range(w_emb.shape[0]):
+        c_idx = w_emb[ii]
+        if c_idx == 3:
+            break
+        else:
+            word += enc.char_list[c_idx]
+    return word
+
+
+w_emb = y['x_char_pred']
+index = 0
+for ii in range(len(text)):
+    for jj in range(len(text[ii])):
+        print('{1:30}"{0}"'.format(_get_word(w_emb[index]).strip(), text[ii][jj]))
+        index += 1
+    print()
diff --git a/README.md b/README.md
index d0ecd347b..e36570ecf 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@ ![Version](https://badge.fury.io/py/nlpcube.svg) [![Python 3](https://img.shields.io/badge/python-3-blue.svg)](https://www.python.org/downloads/release/python-360/) [![GitHub stars](https://img.shields.io/github/stars/adobe/NLP-Cube.svg?style=social&label=Star&maxAge=2592000)](https://github.com/adobe/NLP-Cube/stargazers/)
 ## News
+**[05 August 2021]** - We are releasing version 3.0 of NLPCube and its models, and introducing [FLAVOURS](#flavours). This is a major update, but we did our best to maintain the same API, so previous implementations will not crash. The supported language list is smaller, but you can open an issue for unsupported languages, and we will do our best to add them. Another option is pinning the pip package to the older 0.1.0.8 release: ```pip install nlpcube==0.1.0.8```.
 **[15 April 2019]** - We are releasing version 1.1 models - check all [supported languages below](#languages). Both 1.0 and 1.1 models are trained on the same [UD2.2 corpus](http://hdl.handle.net/11234/1-2837); however, models 1.1 do not use vector embeddings, thus reducing disk space and time required to use them. Some languages actually have a slightly increased accuracy, some a bit decreased. By default, NLP Cube will use the latest (at this time) 1.1 models.
@@ -44,32 +45,42 @@ pip3 install -U nlpcube
 To use NLP-Cube **programmatically** (in Python), follow [this tutorial](examples/1.%20NLP-Cube%20Quick%20Tutorial.ipynb)
 The summary would be:
-```
+```python
 from cube.api import Cube       # import the Cube object
 cube=Cube(verbose=True)         # initialize it
-cube.load("en")                 # select the desired language (it will auto-download the model on first run)
+cube.load("en", device='cpu')   # select the desired language (it will auto-download the model on first run)
 text="This is the text I want segmented, tokenized, lemmatized and annotated with POS and dependencies."
-sentences=cube(text)            # call with your own text (string) to obtain the annotations
+document=cube(text)             # call with your own text (string) to obtain the annotations
 ```
-The ``sentences`` object now contains the annotated text, one sentence at a time. To print the third word's POS (in the first sentence), just run:
+The ``document`` object now contains the annotated text, one sentence at a time. To print the third word's POS (in the first sentence), just run:
 ```
-print(sentences[0][2].upos) # [0] is the first sentence and [2] is the third word
+print(document.sentences[0][2].upos) # [0] is the first sentence and [2] is the third word
 ```
 Each token object has the following attributes: ``index``, ``word``, ``lemma``, ``upos``, ``xpos``, ``attrs``, ``head``, ``label``, ``deps``, ``space_after``. For detailed info about each attribute please see the standard CoNLL format.
 
-### Webserver Usage
+### Flavours
 
-To use NLP-Cube as a **web service**, you need to
-[locally install NLP-Cube](examples/2.%20Advanced%20usage%20-%20NLP-Cube%20local%20installation.ipynb)
-and start the server:
+Previous versions of NLP-Cube were trained on individual treebanks. This means that the same language was supported by
+multiple models at the same time. For instance, you could parse English (en) text with `en_ewt`, `en_esl`, `en_lines`,
+etc. The current version of NLPCube combines all flavours of a treebank under the same umbrella, by jointly optimizing
+a conditioned model. You only need to load the base language, for example `en`, and then select which flavour to apply
+at runtime:
 
-For example, the following command will start the server and preload languages: en, fr and de.
-```bash
-cd cube
-python3 webserver.py --port 8080 --lang=en --lang=fr --lang=de
-```
+```python
+from cube.api import Cube       # import the Cube object
+cube=Cube(verbose=True)         # initialize it
+cube.load("en", device='cpu')   # select the desired language (it will auto-download the model on first run)
+text="This is the text I want segmented, tokenized, lemmatized and annotated with POS and dependencies."
-To test, open the following [link](http://localhost:8080/nlp?lang=en&text=This%20is%20a%20simple%20test) (please copy the address of the link as it is a local address and port link)
+
+# Parse using the default flavour (in this case EWT)
+document=cube(text)             # call with your own text (string) to obtain the annotations
+# or you can specify a flavour
+document=cube(text, flavour='en_lines')
+```
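+
+For reference, here is a minimal sketch of walking the annotated output with the token attributes listed above (illustrative only; it assumes the ``document.sentences`` structure shown in the examples):
+
+```python
+for sentence in document.sentences:
+    for word in sentence:
+        print(word.index, word.word, word.lemma, word.upos, word.head, word.label)
+```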
+
+### Webserver Usage
+The current version dropped webserver support, since most people preferred to implement their own NLP-Cube service.
 
 ## Cite
@@ -94,7 +105,9 @@ or, in bibtex format:
 ```
-## Languages and performance
+## Languages and performance
+
+For comparison, the performance of the 3.0 models is reported on the UD 2.2 corpus, although the distributed models are trained on UD 2.7.
 Results are reported against the test files for each language (available in the UD 2.2 corpus) using the 2018 conll eval script.
 Please see more info about what [each metric represents here](http://universaldependencies.org/conll18/evaluation.html).
@@ -104,148 +117,37 @@ Notes:
 |Language|Model|Token|Sentence|UPOS|XPOS|AllTags|Lemmas|UAS|LAS|
 |--------|-----|:---:|:------:|:--:|:--:|:-----:|:----:|:-:|:-:|
-|Afrikaans|
-| |af-1.0|99.97|99.65|97.28|93.0|91.53|96.42|87.61|83.96|
-| |af-1.1|99.99|99.29|96.72|92.29|90.87|96.48|87.32|83.31|
-|Ancient Greek|
-| |grc-1.0|100.0|18.13|94.92|95.32|84.17|86.59|72.44|67.73|
-| |grc-1.1|100.0|17.61|96.87|97.35|88.36|88.41|73.4|69.36|
-|Arabic|
-| |ar-1.0|99.98|61.05|73.42|69.75|68.12|41.26|53.94|50.31|
-| |ar-1.1|99.99|60.53|73.27|68.98|65.95|40.87|53.06|49.45|
-|Armenian|
-| |hy-1.0|97.34|87.52|74.13|96.76|41.51|60.58|11.41|1.7|
-|Basque|
-| |eu-1.0|99.97|99.83|94.93|99.97|87.24|90.75|85.49|81.35|
-| |eu-1.1|99.97|99.75|95.0|99.97|88.14|90.74|85.1|80.91|
-|Bulgarian|
-| |bg-1.0|99.94|92.8|98.51|95.6|93.99|91.59|92.38|88.84|
-| |bg-1.1|99.93|93.36|98.36|95.91|94.46|92.02|92.39|88.76|
-|Buryat|
-| |bxr-1.0|83.26|31.52|38.08|83.26|16.74|16.05|14.44|6.5|
-|Catalan|
-| |ca-1.0|99.98|99.27|98.17|98.23|96.63|97.83|92.33|89.95|
-| |ca-1.1|99.99|99.51|98.2|98.22|96.72|97.8|92.14|89.6|
 |Chinese|
-| |zh-1.0|93.03|99.1|88.22|88.15|86.91|92.74|73.43|69.52|
-| |zh-1.1|92.34|99.1|86.75|86.66|85.35|92.05|71.0|67.04|
-|Croatian|
-| |hr-1.0|99.92|95.56|97.66|99.92|89.49|93.85|90.61|85.77|
-| |hr-1.1|99.95|95.84|97.56|99.95|89.49|94.01|89.95|84.97|
-|Czech|
-| |cs-1.0|99.99|83.79|98.75|95.54|93.61|95.79|90.67|88.46|
-| |cs-1.1|99.99|84.19|98.54|95.33|94.09|95.7|90.72|88.52|
-|Danish|
-| |da-1.0|99.85|91.79|96.79|99.85|94.29|96.53|85.93|83.05|
-| |da-1.1|99.82|92.64|96.52|99.82|94.39|96.21|85.09|81.83|
-|Dutch|
-| |nl-1.0|99.89|90.75|95.49|93.84|91.73|95.72|89.48|86.1|
-| |nl-1.1|99.91|90.89|95.62|93.92|92.58|95.87|89.76|86.4|
+| |zh-1.0|93.03|99.10|88.22|88.15|86.91|92.74|73.43|69.52|
+| |zh-1.1|92.34|99.10|86.75|86.66|85.35|92.05|71.00|67.04|
+| |zh.3.0|95.88|87.36|91.67|83.54|82.74|85.88|79.15|70.08|
 |English|
 | |en-1.0|99.25|72.8|95.34|94.83|92.48|95.62|84.7|81.93|
 | |en-1.1|99.2|70.94|94.4|93.93|91.04|95.18|83.3|80.32|
-|Estonian|
-| |et-1.0|99.9|91.81|96.02|97.18|91.35|93.26|86.04|82.29|
-| |et-1.1|99.91|91.92|96.8|97.92|93.17|93.9|86.13|82.91|
-|Finnish|
-| |fi-1.0|99.7|88.73|95.45|96.44|90.29|83.69|87.18|83.89|
-| |fi-1.1|99.65|89.23|96.22|97.07|91.8|84.02|87.83|84.96|
+| |en-3.0|98.95|75.00|96.01|95.71|93.75|96.06|87.06|84.61|
 |French|
 | |fr-1.0|99.68|94.2|92.61|95.46|90.79|93.08|84.96|80.91|
 | |fr-1.1|99.67|95.31|92.51|95.45|90.8|93.0|83.88|80.16|
-|Galician|
-| 
|gl-1.0|99.89|97.16|83.01|82.51|81.58|82.95|65.69|61.08| -| |gl-1.1|99.91|97.28|82.6|82.12|80.96|82.71|62.65|58.2| +| |fr-3.0|99.71|93.92|97.33|99.56|96.61|90.79|89.81|87.24| |German| | |de-1.0|99.7|81.19|91.38|94.26|80.37|75.8|79.6|74.35| | |de-1.1|99.77|81.99|90.47|93.82|79.79|75.46|79.3|73.87| -|Gothic| -| |got-1.0|100.0|21.59|93.1|93.8|80.58|83.74|67.23|59.67| -|Greek| -| |el-1.0|99.88|89.46|93.7|93.54|87.14|88.92|85.63|82.05| -| |el-1.1|99.88|89.53|93.28|93.24|87.95|88.65|84.51|79.88| -|Hebrew| -| |he-1.0|99.93|99.69|54.13|54.17|51.49|54.13|34.84|32.29| -| |he-1.1|99.94|100.0|52.78|52.78|49.9|53.45|32.13|29.42| -|Hindi| -| |hi-1.0|99.98|98.84|97.16|96.43|90.29|97.48|94.66|91.26| -| |hi-1.1|100.0|99.11|96.81|96.28|89.74|97.4|94.56|90.96| +| |de-3.0|99.77|86.25|94.70|97.00|85.02|82.73|87.08|82.69| |Hungarian| | |hu-1.0|99.8|94.18|94.52|99.8|86.22|91.07|81.57|75.95| | |hu-1.1|99.88|97.77|93.11|99.88|86.79|91.18|77.89|70.94| -|Indonesian| -| |id-1.0|99.95|93.59|93.13|94.15|87.65|82.19|85.01|78.18| -| |id-1.1|100.0|94.58|92.95|92.81|86.27|81.51|84.73|77.99| -|Irish| -| |ga-1.0|99.56|95.38|90.95|90.07|74.1|87.51|76.32|64.74| +| |hu-3.0|99.75|91.64|96.43|99.75|89.89|91.31|86.34|81.29| |Italian| | |it-1.0|99.89|98.14|86.86|86.67|84.97|87.03|78.3|74.59| | |it-1.1|99.92|99.07|86.58|86.4|84.53|86.75|76.38|72.35| -|Japanese| -| |ja-1.0|92.73|94.92|90.05|92.73|90.02|91.75|80.47|77.97| -| |ja-1.1|92.42|94.92|90.28|92.42|90.28|91.66|79.94|77.79| -|Kazakh| -| |kk-1.0|92.26|75.57|57.38|55.75|22.12|21.35|39.55|19.48| -|Korean| -| |ko-1.0|99.87|93.9|94.66|86.92|83.81|38.7|85.52|81.39| -| |ko-1.1|99.88|94.23|94.61|88.41|85.27|38.68|85.16|80.89| -|Kurmanji| -| |kmr-1.0|89.92|88.86|53.66|52.52|25.96|53.94|12.06|5.53| -|Latin| -| |la-1.0|99.97|92.5|97.95|93.75|91.76|96.9|89.2|86.29| -| |la-1.1|99.99|92.75|98.22|94.03|92.16|97.18|89.19|86.58| -|Latvian| -| |lv-1.0|99.66|96.35|93.43|82.52|79.99|89.47|83.04|77.98| -|North Sami| -| |sme-1.0|99.75|98.79|86.07|87.38|71.34|80.9|66.54|56.93| -|Norwegian| -| |no_bokmaal-1.0|99.92|90.93|84.24|99.92|73.68|71.68|78.24|70.83| -| |no_bokmaal-1.1|99.92|90.32|84.69|99.92|74.84|71.47|77.71|70.63| -| |no_nynorsk-1.0|99.96|91.08|97.33|99.96|93.87|85.82|90.33|88.02| -| |no_nynorsk-1.1|99.96|92.18|97.47|99.96|94.75|86.07|90.23|87.98| -|Old Church Slavonic| -| |cu-1.0|100.0|28.99|92.88|93.09|81.85|83.16|72.18|65.43| -|Persian| -| |fa-1.0|100.0|97.91|96.34|96.17|95.51|89.4|88.35|85.08| -| |fa-1.1|100.0|99.0|95.92|95.78|95.05|89.32|87.43|83.38| -|Portuguese| -| |pt-1.0|99.69|87.88|85.02|88.39|81.35|86.23|76.38|72.99| -| |pt-1.1|99.75|88.1|84.39|88.46|79.79|85.85|75.11|71.61| -|Romanian| +| |it-3.0|99.92|98.13|98.26|98.15|97.34|97.76|94.07|92.66| +|Romanian (RO-RRT)| | |ro-1.0|99.74|95.56|97.42|96.59|95.49|96.91|90.38|85.23| | |ro-1.1|99.71|95.42|96.96|96.32|94.98|96.57|90.14|85.06| -|Russian| -| |ru-1.0|99.71|98.79|98.4|99.71|95.55|93.89|92.7|90.97| -| |ru-1.1|99.73|98.5|98.48|99.73|95.37|93.8|92.88|90.99| -|Serbian| -| |sr-1.0|99.97|92.61|97.61|99.97|91.54|92.93|90.89|86.92| -| |sr-1.1|99.97|92.0|97.88|99.97|92.57|93.31|90.96|87.04| -|Slovak| -| |sk-1.0|99.97|86.0|95.82|82.3|78.43|90.35|88.83|85.69| -| |sk-1.1|99.95|86.67|95.33|81.01|76.98|89.87|87.64|83.84| -|Slovenian| -| |sl-1.0|99.91|97.51|97.85|92.52|91.27|96.35|91.4|89.38| -| |sl-1.1|99.87|97.64|97.62|93.29|90.99|96.36|91.46|89.19| +| |ro-3.0|99.80|95.64|97.67|97.11|96.76|97.55|92.06|87.67| |Spanish| | |es-1.0|99.98|98.32|98.0|98.0|96.62|98.05|90.53|88.27| -| |es-1.1|99.98|98.4|98.01|98.0|96.6|97.99|90.51|88.16| -|Swedish| 
-| |sv-1.0|99.94|92.54|97.21|95.18|92.88|97.06|88.09|84.74| -| |sv-1.1|99.36|91.22|92.74|0.0|0.0|89.37|78.14|71.86| -|Turkish| -| |tr-1.0|99.89|97.4|90.37|89.56|81.59|87.4|65.22|58.26| -| |tr-1.1|99.88|96.79|90.79|90.17|83.26|87.84|64.69|57.07| -|Ukrainian| -| |uk-1.0|99.65|93.96|96.31|88.23|86.0|92.08|86.25|82.96| -| |uk-1.1|99.76|93.58|96.0|88.17|85.39|92.28|84.9|81.04| -|Upper Sorbian| -| |hsb-1.0|98.59|69.15|59.61|98.59|37.96|22.33|11.11|3.35| -|Urdu| -| |ur-1.0|100.0|98.6|93.55|91.69|77.41|97.33|87.86|81.99| -| |ur-1.1|100.0|98.6|92.85|91.02|77.18|97.2|87.12|80.83| -|Uyghur| -| |ug-1.0|99.91|83.83|87.85|91.58|73.93|90.17|74.36|60.5| -| |ug-1.1|99.7|84.18|88.07|90.38|75.28|92.28|75.16|62.13| -|Vietnamese| -| |vi-1.0|87.2|92.88|78.35|76.43|76.18|81.47|51.59|45.49| -| |vi-1.1|86.87|92.51|76.72|74.57|72.27|81.31|50.29|43.76| +| |es-1.1|99.98|98.40|98.01|98.00|96.6|97.99|90.51|88.16| +| |es-3.0|99.96|97.17|96.88|99.91|94.88|98.17|92.11|89.86| + + diff --git a/cube/.gitignore b/_cube/.gitignore similarity index 100% rename from cube/.gitignore rename to _cube/.gitignore diff --git a/cube/graph/__init__.py b/_cube/__init__.py similarity index 100% rename from cube/graph/__init__.py rename to _cube/__init__.py diff --git a/_cube/api.py b/_cube/api.py new file mode 100644 index 000000000..ff06e769b --- /dev/null +++ b/_cube/api.py @@ -0,0 +1,240 @@ +# -*- coding: utf-8 -*- + +import sys +import os + +class Cube(object): + def __init__(self, verbose=False, random_seed=None, memory=512, autobatch=False, use_gpu=False): + """ + Create an empty Cube instance. + Before it can be used, you must call @method load with @param language_code set to your target language. + """ + self._loaded = False + self._verbose = verbose + import dynet_config + + if random_seed is not None: + if not isinstance(random_seed, int): + raise Exception("Random seed must be an integer!") + if random_seed == 0: + print("[Warning] While Python and Numpy's seeds are now set to 0, DyNet uses 0 to reset the seed generator (fully random). Use any non-zero int value to set DyNet to a fixed random seed.") + # set python random seed + import random + random.seed(random_seed) + # set numpy random seed + import numpy as np + np.random.seed(random_seed) + else: + random_seed = 0 # this is the default value for DyNet (meaning full random) + + dynet_config.set(mem=memory, random_seed=random_seed, autobatch=autobatch) + if use_gpu: + dynet_config.set_gpu() + + def load(self, language_code, version="latest", local_models_repository=None, local_embeddings_file=None, tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True): + """ + Loads the pipeline with all available models for the target language. + + @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes + @param version: "latest" to get the latest version, or a specific version such as "1.0", "2.1", etc.
+ + """ + from .io_utils.encodings import Encodings + from .io_utils.embeddings import WordEmbeddings + from .io_utils.model_store import ModelMetadata, ModelStore + from .io_utils.config import TieredTokenizerConfig, CompoundWordConfig, LemmatizerConfig, TaggerConfig, ParserConfig + from .generic_networks.tokenizers import TieredTokenizer + from .generic_networks.token_expanders import CompoundWordExpander + from .generic_networks.lemmatizers import FSTLemmatizer + from .generic_networks.taggers import BDRNNTagger + from .generic_networks.parsers import BDRNNParser + + self._tokenizer = None # tokenizer object, default is None + self._compound_word_expander = None # compound word expander, default is None + self._lemmatizer = None # lemmatizer object, default is None + self._parser = None # parser object, default is None + self._tagger = None # tagger object, default is None + self.metadata = ModelMetadata() + + # Initialize a ModelStore object + if local_models_repository: + model_store_object = ModelStore(disk_path=local_models_repository) + else: + model_store_object = ModelStore() + + # Find a local model or download it if it does not exist, returning the local model folder path + model_folder_path = model_store_object.find(lang_code=language_code, version=version, verbose=self._verbose) + + # If the model contains metadata, load it + if os.path.isfile(os.path.join(model_folder_path, "metadata.json")): + self.metadata.read(os.path.join(model_folder_path, "metadata.json")) + else: + self.metadata = None + + # Load embeddings + embeddings = WordEmbeddings(verbose=False) + if self._verbose: + sys.stdout.write('\tLoading embeddings ... \n') + if local_embeddings_file is not None: + embeddings.read_from_file(local_embeddings_file, None, full_load=False) + else: # embeddings file is not manually specified + if self.metadata is None: # no metadata exists + raise Exception("When using a locally-trained model please specify a path to a local embeddings file (local_embeddings_file cannot be None).") + else: # load from the metadata path + if self.metadata.embeddings_file_name is None or self.metadata.embeddings_file_name == "": + # load a dummy embedding + embeddings.load_dummy_embeddings() + else: + # load full embedding from file + emb_path = os.path.join(model_store_object.embeddings_repository, self.metadata.embeddings_file_name) + if not os.path.exists(emb_path): + raise Exception("Embeddings file not found: {}".format(emb_path)) + embeddings.read_from_file(emb_path, None, full_load=False) + + # 1. Load tokenizer + if tokenization: + if not os.path.isfile(os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')): + sys.stdout.write('\tTokenization is not available on this model. \n') + else: + if self._verbose: + sys.stdout.write('\tLoading tokenization model ...\n') + tokenizer_encodings = Encodings(verbose=False) + tokenizer_encodings.load(os.path.join(model_folder_path, 'tokenizer.encodings')) + config = TieredTokenizerConfig(os.path.join(model_folder_path, 'tokenizer.conf')) + self._tokenizer = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True) + self._tokenizer.load(os.path.join(model_folder_path, 'tokenizer')) + + # 2. Load compound + if compound_word_expanding: + if not os.path.isfile(os.path.join(model_folder_path, 'compound.bestAcc')): + if self._verbose: # supress warning here because many languages do not have compund words + sys.stdout.write('\tCompound word expansion is not available on this model. 
\n') + else: + if self._verbose: + sys.stdout.write('\tLoading compound word expander model ...\n') + compound_encodings = Encodings(verbose=False) + compound_encodings.load(os.path.join(model_folder_path, 'compound.encodings')) + config = CompoundWordConfig(os.path.join(model_folder_path, 'compound.conf')) + self._compound_word_expander = CompoundWordExpander(config, compound_encodings, embeddings, + runtime=True) + self._compound_word_expander.load(os.path.join(model_folder_path, 'compound.bestAcc')) + + # 3. Load lemmatizer + if lemmatization: + if not os.path.isfile(os.path.join(model_folder_path, 'lemmatizer.bestAcc')): + sys.stdout.write('\tLemmatizer is not available on this model. \n') + else: + if self._verbose: + sys.stdout.write('\tLoading lemmatization model ...\n') + lemmatizer_encodings = Encodings(verbose=False) + lemmatizer_encodings.load(os.path.join(model_folder_path, 'lemmatizer.encodings')) + config = LemmatizerConfig(os.path.join(model_folder_path, 'lemmatizer.conf')) + self._lemmatizer = FSTLemmatizer(config, lemmatizer_encodings, embeddings, runtime=True) + self._lemmatizer.load(os.path.join(model_folder_path, 'lemmatizer.bestAcc')) + + # 4. Load tagger + if tagging or lemmatization: # we need tagging for lemmatization + if not os.path.isfile(os.path.join(model_folder_path, 'tagger.bestUPOS')): + sys.stdout.write('\tTagging is not available on this model. \n') + if lemmatization: + sys.stdout.write('\t\tDisabling the lemmatization model due to missing tagger. \n') + self._lemmatizer = None + else: + if self._verbose: + sys.stdout.write('\tLoading tagger model ...\n') + tagger_encodings = Encodings(verbose=False) + tagger_encodings.load(os.path.join(model_folder_path, 'tagger.encodings')) + config = TaggerConfig(os.path.join(model_folder_path, 'tagger.conf')) + self._tagger = [None, None, None] + self._tagger[0] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) + self._tagger[0].load(os.path.join(model_folder_path, 'tagger.bestUPOS')) + self._tagger[1] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) + self._tagger[1].load(os.path.join(model_folder_path, 'tagger.bestXPOS')) + self._tagger[2] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) + self._tagger[2].load(os.path.join(model_folder_path, 'tagger.bestATTRS')) + + # 5. Load parser + if parsing: + if not os.path.isfile(os.path.join(model_folder_path, 'parser.bestUAS')): + sys.stdout.write('\tParsing is not available on this model... 
\n') + else: + if self._verbose: + sys.stdout.write('\tLoading parser model ...\n') + parser_encodings = Encodings(verbose=False) + parser_encodings.load(os.path.join(model_folder_path, 'parser.encodings')) + config = ParserConfig(os.path.join(model_folder_path, 'parser.conf')) + self._parser = BDRNNParser(config, parser_encodings, embeddings, runtime=True) + self._parser.load(os.path.join(model_folder_path, 'parser.bestUAS')) + + self._loaded = True + if self._verbose: + sys.stdout.write('Model loading complete.\n\n') + + def __call__(self, text): + if not self._loaded: + raise Exception("Cube object is initialized but no model is loaded (e.g. call cube.load('en'))") + + sequences = [] + if self._tokenizer: + if not isinstance(text, str): + raise Exception("The text argument must be a string!") + # split text by lines + input_lines = text.split("\n") + for input_line in input_lines: + sequences += self._tokenizer.tokenize(input_line) + else: + if not isinstance(text, list): + raise Exception("The text argument must be a list of lists of tokens!") + sequences = text # the input should already be tokenized + + if self._compound_word_expander: + sequences = self._compound_word_expander.expand_sequences(sequences) + + if self._parser: + sequences = self._parser.parse_sequences(sequences) + + if self._tagger or self._lemmatizer: + import copy + new_sequences = [] + for sequence in sequences: + new_sequence = copy.deepcopy(sequence) + predicted_tags_UPOS = self._tagger[0].tag(new_sequence) + predicted_tags_XPOS = self._tagger[1].tag(new_sequence) + predicted_tags_ATTRS = self._tagger[2].tag(new_sequence) + for entryIndex in range(len(new_sequence)): + new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0] + new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1] + new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2] + new_sequences.append(new_sequence) + sequences = new_sequences + + if self._lemmatizer: + sequences = self._lemmatizer.lemmatize_sequences(sequences) + + return sequences + + +if __name__ == "__main__": + cube = Cube(verbose=True) + cube.load('en', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True) + cube.metadata.info() + + text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." 
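A note on the `__call__` contract above: when the pipeline is loaded with `tokenization=False`, no tokenizer is attached, so the `list` branch is taken and the input must already be split into sentences and tokens. A minimal sketch of that mode (hypothetical usage, not part of this patch; the import path and language code are assumptions):

    # Illustrative only: pre-tokenized input, one inner list per sentence.
    from _cube.api import Cube  # assumed path, following the cube/ -> _cube/ rename in this patch

    cube_pretok = Cube(verbose=False)
    cube_pretok.load('en', tokenization=False, compound_word_expanding=False,
                     tagging=True, lemmatization=True, parsing=True)
    sequences = cube_pretok([["This", "input", "is", "already", "tokenized", "."]])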
+ + sentences = cube(text) + + for sentence in sentences: + print() + for token in sentence: + line = "" + line += str(token.index) + "\t" + line += token.word + "\t" + line += token.lemma + "\t" + line += token.upos + "\t" + line += token.xpos + "\t" + line += token.attrs + "\t" + line += str(token.head) + "\t" + line += token.label + "\t" + line += token.deps + "\t" + line += token.space_after + print(line) diff --git a/cube/misc/__init__.py b/_cube/generic_networks/__init__.py similarity index 100% rename from cube/misc/__init__.py rename to _cube/generic_networks/__init__.py diff --git a/cube/generic_networks/character_embeddings.py b/_cube/generic_networks/character_embeddings.py similarity index 83% rename from cube/generic_networks/character_embeddings.py rename to _cube/generic_networks/character_embeddings.py index 69722c86a..a42a77233 100644 --- a/cube/generic_networks/character_embeddings.py +++ b/_cube/generic_networks/character_embeddings.py @@ -22,6 +22,7 @@ class CharacterNetwork: def __init__(self, character_embeddings_size, encodings, rnn_size=100, rnn_layers=1, embeddings_size=100, + lang_embeddings_size=100, model=None, runtime=False): if model is None: self.model = dy.Model() @@ -39,7 +40,8 @@ def __init__(self, character_embeddings_size, encodings, rnn_size=100, rnn_layer self.rnn_bw = [] self.rnn_layers = rnn_layers self.rnn_size = rnn_size - input_size = character_embeddings_size + 3 + input_size = character_embeddings_size + 3 + lang_embeddings_size + for _ in range(rnn_layers): if runtime: self.rnn_fw.append(dy.VanillaLSTMBuilder(1, input_size, rnn_size, self.model)) @@ -49,28 +51,23 @@ def __init__(self, character_embeddings_size, encodings, rnn_size=100, rnn_layer self.rnn_fw.append(orthonormal_VanillaLSTMBuilder(1, input_size, rnn_size, self.model)) self.rnn_bw.append(orthonormal_VanillaLSTMBuilder(1, input_size, rnn_size, self.model)) - input_size = rnn_size * 2 + input_size = rnn_size * 2 + lang_embeddings_size + + lemb_size = 2* lang_embeddings_size + #if rnn_layers > 1: + # lemb_size = 2 * lang_embeddings_size self.linearW = self.model.add_parameters( - (embeddings_size, rnn_size * 4)) # last state and attention over the other states + (embeddings_size, rnn_size * 4 + lemb_size)) # last state and attention over the other states self.linearB = self.model.add_parameters((embeddings_size)) - self.att_w1 = self.model.add_parameters((rnn_size, rnn_size * 2)) + self.att_w1 = self.model.add_parameters((rnn_size, rnn_size * 2 + lang_embeddings_size)) self.att_w2 = self.model.add_parameters((rnn_size, rnn_size * 2)) self.att_v = self.model.add_parameters((1, rnn_size)) - def compute_embeddings(self, word, runtime=True): + def compute_embeddings(self, word, runtime=True, language_embeddings=None): x_list = [] - import sys import copy - if sys.version_info[0] == 2: - if not isinstance(word, unicode): - - uniword = unicode(word, 'utf-8') - else: - uniword = copy.deepcopy(word) - else: - uniword = copy.deepcopy(word) - # print (uniword) + uniword = copy.deepcopy(word) uniword = re.sub('\d', '0', uniword) for i in range(len(uniword)): char = uniword[i] @@ -87,6 +84,13 @@ def compute_embeddings(self, word, runtime=True): else: x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int['']], style_emb])) + # update characters with language embeddings + if language_embeddings is not None: + tmp = [] + for x in x_list: + tmp.append(dy.concatenate([x, language_embeddings])) + x_list = tmp + rnn_outputs = x_list rnn_states_fw = None rnn_states_bw = None @@ 
-104,6 +108,7 @@ def compute_embeddings(self, word, runtime=True): rnn_bw = rnn_bw.initial_state() rnn_states_fw = [] rnn_states_bw = [] + for x in rnn_outputs: rnn_fw = rnn_fw.add_input(x) rnn_states_fw.append(rnn_fw) @@ -114,11 +119,11 @@ def compute_embeddings(self, word, runtime=True): bw.append(rnn_states_bw[-1].output()) rnn_outputs = [] for x1, x2 in zip(fw, reversed(bw)): - rnn_outputs.append(dy.concatenate([x1, x2])) + rnn_outputs.append(dy.concatenate([x1, x2, language_embeddings])) attention = self._attend(rnn_outputs, rnn_states_fw[-1], rnn_states_bw[-1]) - pre_linear = dy.concatenate([fw[-1], bw[-1], attention]) + pre_linear = dy.concatenate([fw[-1], bw[-1], attention, language_embeddings]) embedding = dy.tanh(self.linearW.expr(update=True) * pre_linear + self.linearB.expr(update=True)) return embedding, rnn_outputs diff --git a/_cube/generic_networks/crf.py b/_cube/generic_networks/crf.py new file mode 100644 index 000000000..0874f2c18 --- /dev/null +++ b/_cube/generic_networks/crf.py @@ -0,0 +1,377 @@ +# +# Author: Tiberiu Boros +# +# Copyright (c) 2019 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Code adapted from https://github.com/rguthrie3/BiLSTM-CRF/blob/master/model.py +# and from https://github.com/neulab/cmu-ner/blob/master/models/decoders.py + +import dynet as dy +import numpy as np + + +class CRFDecoder: + def __init__(self, model, src_output_dim, tag_emb_dim, tag_size, constraints=None): + self.model = model + self.start_id = tag_size + self.end_id = tag_size + 1 + self.tag_size = tag_size + 2 + tag_size = tag_size + 2 + + # optional: transform the hidden space of src encodings into the tag embedding space + self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim)) + self.b_src2tag_readout = model.add_parameters((tag_emb_dim)) + self.b_src2tag_readout.zero() + + self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim)) + self.b_scores_readout2tag = model.add_parameters((tag_size)) + self.b_scores_readout2tag.zero() + + # (to, from), trans[i] is the transition score to i + init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to + # init_transition_matrix[self.start_id, :] = -1000.0 + # init_transition_matrix[:, self.end_id] = -1000.0 + init_transition_matrix[self.end_id, :] = -1000.0 + init_transition_matrix[:, self.start_id] = -1000.0 + if constraints is not None: + init_transition_matrix = self._constrained_transition_init(init_transition_matrix, constraints) + # print init_transition_matrix + self.transition_matrix = model.lookup_parameters_from_numpy(init_transition_matrix) + + self.interpolation = True # args.interp_crf_score + if self.interpolation: + self.W_weight_transition = model.add_parameters((1, tag_emb_dim)) + self.b_weight_transition = model.add_parameters((1)) + self.b_weight_transition.zero() + + def learn(self, src_enc, tgt_tags): + return self.decode_loss(src_enc, [tgt_tags]) + + def tag(self, src_enc): + return self.decoding(src_enc)[1] + + 
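An aside on `_log_sum_exp_dim_0` below: the CRF forward pass sums exponentiated path scores in log space, and a naive `log(sum(exp(x)))` overflows as soon as the scores get large. The method therefore subtracts the per-column maximum before exponentiating and adds it back afterwards. The same trick in plain NumPy (illustrative sketch, independent of DyNet):

    import numpy as np

    def log_sum_exp(x, axis=0):
        # Subtract the max before exponentiating so exp() never sees
        # large positive arguments, then add it back outside the log.
        m = np.max(x, axis=axis, keepdims=True)
        return np.squeeze(m, axis=axis) + np.log(np.sum(np.exp(x - m), axis=axis))

    scores = np.array([1000.0, 999.0])
    print(log_sum_exp(scores))             # ~1000.313, finite
    print(np.log(np.sum(np.exp(scores))))  # naive form overflows to inf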
def _constrained_transition_init(self, transition_matrix, constraints): + ''' + :param transition_matrix: numpy array, (from, to) + :param constraints: [[from_indexes], [to_indexes]] + :return: newly initialized transition matrix + ''' + for cons in constraints: + transition_matrix[cons[0], cons[1]] = -1000.0 + return transition_matrix + + def _log_sum_exp_dim_0(self, x): + # numerically stable log_sum_exp + dims = x.dim() + max_score = dy.max_dim(x, 0) # (dim_1, batch_size) + if len(dims[0]) == 1: + max_score_extend = max_score + else: + max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1]) + max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0]) + x = x - max_score_extend + exp_x = dy.exp(x) + # (dim_1, batch_size), if no dim_1, return ((1,), batch_size) + log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0]) + return log_sum_exp_x + max_score + + def forward_alg(self, tag_scores): + ''' Forward DP for CRF. + tag_scores (list of batched dy.Tensor): (tag_size, batch_size) + ''' + # Be aware: if a is lookup_parameter with 2 dimensions, then a[i] returns one row; + # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transposes a + transpose_transition_score = self.transition_matrix#.expr(update=True) + # transpose_transition_score = dy.transpose(transition_score) + # alpha(t', s) = the score of sequence from t=0 to t=t' in log space + # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) + # np_init_alphas[self.start_id, :] = 0.0 + # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) + + alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] + # self.transition_matrix[i]: from i, column + # transpose_score[i]: to i, row + # transpose_score: to, from + + for tag_score in tag_scores[1:]: + # extend for each transit + alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) + # each column i of tag_score will be the repeated emission score to tag i + tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) + alpha_t = alpha_tm1 + transpose_transition_score + tag_score + alpha_tm1 = self._log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) + + terminal_alpha = self._log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) + return terminal_alpha + + def score_one_sequence(self, tag_scores, tags, batch_size): + ''' tags: list of tag ids at each time step ''' + # print tags, batch_size + # print batch_size + # print "scoring one sentence" + tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1 + score = dy.inputTensor(np.zeros(batch_size), batched=True) + # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size + # print "tag dim: ", tag_scores.dim() + for i in range(len(tags) - 1): + score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \ + + dy.pick_batch(tag_scores[i], tags[i + 1]) + score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id] * batch_size), tags[-1]) + return score + + def _transpose_input(self, seq, padding_token=0): + # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]] + max_len = max([len(sent) for sent in seq]) + seq_pad = [] + seq_mask = [] + for i in range(max_len): + pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq] + mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq] + seq_pad.append(pad_temp) + seq_mask.append(mask_temp) + + return seq_pad, seq_mask
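For reference, the quantity that `decoding` further below maximizes: the Viterbi-best tag path under per-position emission scores plus a tag-to-tag transition matrix. A self-contained NumPy sketch with an explicit `trans[i, j] = score(from tag i to tag j)` convention (illustrative only; the DyNet implementation additionally handles the synthetic start/end tags and batching):

    import numpy as np

    def viterbi(emit, trans):
        # emit[t, j]: score of tag j at position t; trans[i, j]: score of i -> j.
        T, K = emit.shape
        delta = emit[0].copy()               # best score of a path ending in tag j at t=0
        back = np.zeros((T, K), dtype=int)   # backpointers
        for t in range(1, T):
            cand = delta[:, None] + trans    # cand[i, j]: extend best path at i with i -> j
            back[t] = np.argmax(cand, axis=0)
            delta = cand[back[t], np.arange(K)] + emit[t]
        path = [int(np.argmax(delta))]
        for t in range(T - 1, 0, -1):
            path.append(int(back[t][path[-1]]))
        return list(reversed(path)), float(np.max(delta))

    emit = np.array([[2.0, 0.0], [0.0, 1.0], [1.0, 0.5]])
    trans = np.array([[0.5, -1.0], [-1.0, 0.5]])
    print(viterbi(emit, trans))  # ([0, 0, 0], 4.0) for these scores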
+ + def decode_loss(self, src_encodings, tgt_tags): + # This is the batched version, which requires bucketed batch input of the same length. + ''' + The length of src_encodings and tgt_tags is the number of time steps. + src_encodings: list of dynet.Tensor (src_output_dim, batch_size) + tgt_tags: list of tag ids [(1, batch_size)] + return: average of negative log likelihood + ''' + # TODO: transpose tgt tags first + batch_size = len(tgt_tags) + tgt_tags, tgt_mask = self._transpose_input(tgt_tags, 0) + W_src2tag_readout = self.W_src2tag_readout.expr(update=True) + b_src2tag_readout = self.b_src2tag_readout.expr(update=True) + W_score_tag = self.W_scores_readout2tag.expr(update=True) + b_score_tag = self.b_scores_readout2tag.expr(update=True) + + tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding + in src_encodings] + if self.interpolation: + W_transit = self.W_weight_transition.expr(update=True) + b_transit = self.b_weight_transition.expr(update=True) + step_weight_on_transit = [dy.logistic(dy.affine_transform([b_transit, W_transit, tag_emb])) for tag_emb in + tag_embs] + + tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] + + # scores over all paths, all scores are in log-space + forward_scores = self.forward_alg(tag_scores) + gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size) + # negative log likelihood + loss = dy.sum_batches(forward_scores - gold_score) / batch_size + return loss # , dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size + + def get_crf_scores(self, src_encodings): + W_src2tag_readout = self.W_src2tag_readout.expr(update=True) + b_src2tag_readout = self.b_src2tag_readout.expr(update=True) + W_score_tag = self.W_scores_readout2tag.expr(update=True) + b_score_tag = self.b_scores_readout2tag.expr(update=True) + + tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) + for src_encoding in src_encodings] + tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] + + transpose_transition_score = self.transition_matrix#.expr(update=True) # (to, from) + + return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores] + + def decoding(self, src_encodings): + ''' Viterbi decoding for a single sequence. 
''' + W_src2tag_readout = self.W_src2tag_readout.expr(update=True) + b_src2tag_readout = self.b_src2tag_readout.expr(update=True) + W_score_tag = self.W_scores_readout2tag.expr(update=True) + b_score_tag = self.b_scores_readout2tag.expr(update=True) + + tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) + for src_encoding in src_encodings] + tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] + + back_trace_tags = [] + np_init_alpha = np.ones(self.tag_size) * -2000.0 + np_init_alpha[self.start_id] = 0.0 + max_tm1 = dy.inputTensor(np_init_alpha) + transpose_transition_score = self.transition_matrix#.expr(update=True) # (to, from) + + for i, tag_score in enumerate(tag_scores): + max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size) + max_t = max_tm1 + transpose_transition_score + if i != 0: + eval_score = max_t.npvalue()[:-2, :] + else: + eval_score = max_t.npvalue() + best_tag = np.argmax(eval_score, axis=0) + back_trace_tags.append(best_tag) + max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score + + terminal_max_T = max_tm1 + self.transition_matrix[self.end_id] + eval_terminal = terminal_max_T.npvalue()[:-2] + best_tag = np.argmax(eval_terminal, axis=0) + best_path_score = eval_terminal[best_tag] + + best_path = [best_tag] + for btpoint in reversed(back_trace_tags): + best_tag = btpoint[best_tag] + best_path.append(best_tag) + start = best_path.pop() + assert start == self.start_id + best_path.reverse() + return best_path_score, best_path + + def cal_accuracy(self, pred_path, true_path): + return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path) + + +class CRFLabeler: + def __init__(self, tagset_size, num_lstm_layers, hidden_dim, input_dim, model=None): + if model is None: + self.model = dy.Model() + else: + self.model = model + self.tagset_size = tagset_size + 2 + + self.START = tagset_size + self.STOP = tagset_size + 1 + + # LSTM parameters + self.bi_lstm = dy.BiRNNBuilder(num_lstm_layers, input_dim, hidden_dim, self.model, dy.LSTMBuilder) + + # Matrix that maps from Bi-LSTM output to num tags + self.lstm_to_tags_params = self.model.add_parameters((self.tagset_size, hidden_dim)) + self.lstm_to_tags_bias = self.model.add_parameters(self.tagset_size) + self.mlp_out = self.model.add_parameters((self.tagset_size, self.tagset_size)) + self.mlp_out_bias = self.model.add_parameters(self.tagset_size) + + # Transition matrix for tagging layer, [i,j] is score of transitioning to i from j + self.transitions = self.model.add_lookup_parameters((self.tagset_size, self.tagset_size)) + + def learn(self, input, tags): + return self._neg_log_loss(input, tags) + + def tag(self, input): + return self._viterbi_decoding(self._build_tagging_graph(input))[0] + + def set_dropout(self, p): + self.bi_lstm.set_dropout(p) + + def disable_dropout(self): + self.bi_lstm.disable_dropout() + + def _build_tagging_graph(self, sentence): + # embeddings = [self.word_rep(w) for w in sentence] + embeddings = sentence + + lstm_out = self.bi_lstm.transduce(embeddings) + + H = self.lstm_to_tags_params.expr(update=True) + Hb = self.lstm_to_tags_bias.expr(update=True) + O = self.mlp_out.expr(update=True) + Ob = self.mlp_out_bias.expr(update=True) + # H = dy.parameter(self.lstm_to_tags_params) + # Hb = dy.parameter(self.lstm_to_tags_bias) + # O = dy.parameter(self.mlp_out) + # Ob = dy.parameter(self.mlp_out_bias) + scores = [] + for rep in lstm_out: + score_t = O * dy.tanh(H * rep + Hb) + Ob 
+ scores.append(score_t) + + return scores + + def _score_sentence(self, observations, tags): + assert len(observations) == len(tags) + score_seq = [0] + score = dy.scalarInput(0) + tags = [self.START] + tags + for i, obs in enumerate(observations): + score = score + dy.pick(self.transitions[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1]) + score_seq.append(score.value()) + score = score + dy.pick(self.transitions[self.STOP], tags[-1]) + return score + + def _viterbi_loss(self, observations, tags): + # observations = self.build_tagging_graph(sentence) + viterbi_tags, viterbi_score = self._viterbi_decoding(observations) + if viterbi_tags != tags: + gold_score = self._score_sentence(observations, tags) + return (viterbi_score - gold_score), viterbi_tags + else: + return dy.scalarInput(0), viterbi_tags + + def _neg_log_loss(self, sentence, tags): + observations = self._build_tagging_graph(sentence) + gold_score = self._score_sentence(observations, tags) + forward_score = self._forward(observations) + return forward_score - gold_score + + def _forward(self, observations): + + def log_sum_exp(scores): + npval = scores.npvalue() + argmax_score = np.argmax(npval) + max_score_expr = dy.pick(scores, argmax_score) + max_score_expr_broadcast = dy.concatenate([max_score_expr] * self.tagset_size) + return max_score_expr + dy.log(dy.sum_dim(dy.transpose(dy.exp(scores - max_score_expr_broadcast)), [1])) + + init_alphas = [-1e10] * self.tagset_size + init_alphas[self.START] = 0 + for_expr = dy.inputVector(init_alphas) + for obs in observations: + alphas_t = [] + for next_tag in range(self.tagset_size): + obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.tagset_size) + next_tag_expr = for_expr + self.transitions[next_tag] + obs_broadcast + alphas_t.append(log_sum_exp(next_tag_expr)) + for_expr = dy.concatenate(alphas_t) + terminal_expr = for_expr + self.transitions[self.STOP] + alpha = log_sum_exp(terminal_expr) + return alpha + + def _viterbi_decoding(self, observations): + backpointers = [] + init_vvars = [-1e10] * self.tagset_size + init_vvars[self.START] = 0 # has all the probability + for_expr = dy.inputVector(init_vvars) + trans_exprs = [self.transitions[idx] for idx in range(self.tagset_size)] + for obs in observations: + bptrs_t = [] + vvars_t = [] + for next_tag in range(self.tagset_size): + next_tag_expr = for_expr + trans_exprs[next_tag] + next_tag_arr = next_tag_expr.npvalue() + best_tag_id = np.argmax(next_tag_arr) + bptrs_t.append(best_tag_id) + vvars_t.append(dy.pick(next_tag_expr, best_tag_id)) + for_expr = dy.concatenate(vvars_t) + obs + backpointers.append(bptrs_t) + # Perform final transition to terminal + terminal_expr = for_expr + trans_exprs[self.STOP] + terminal_arr = terminal_expr.npvalue() + best_tag_id = np.argmax(terminal_arr) + path_score = dy.pick(terminal_expr, best_tag_id) + # Reverse over the backpointers to get the best path + best_path = [best_tag_id] # Start with the tag that was best for terminal + for bptrs_t in reversed(backpointers): + best_tag_id = bptrs_t[best_tag_id] + best_path.append(best_tag_id) + start = best_path.pop() # Remove the start symbol + best_path.reverse() + assert start == self.START + # Return best path and best path's score + return best_path, path_score diff --git a/cube/generic_networks/lemmatizers.py b/_cube/generic_networks/lemmatizers.py similarity index 100% rename from cube/generic_networks/lemmatizers.py rename to _cube/generic_networks/lemmatizers.py diff --git a/cube/generic_networks/ner.py b/_cube/generic_networks/ner.py 
similarity index 100% rename from cube/generic_networks/ner.py rename to _cube/generic_networks/ner.py diff --git a/_cube/generic_networks/parsers.py b/_cube/generic_networks/parsers.py new file mode 100644 index 000000000..2d2016fe2 --- /dev/null +++ b/_cube/generic_networks/parsers.py @@ -0,0 +1,406 @@ +# +# Author: Tiberiu Boros +# +# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import random +import dynet as dy +from cube.generic_networks.character_embeddings import CharacterNetwork +from cube.graph.decoders import GreedyDecoder +from cube.generic_networks.utils import orthonormal_VanillaLSTMBuilder +import copy +import sys + + +class BDRNNParser: + def __init__(self, parser_config, encodings, num_languages=1, aux_softmax_weight=0.2, runtime=False): + self.config = parser_config + self.encodings = encodings + self.decoder = GreedyDecoder() + + self.model = dy.Model() + + # self.trainer = dy.SimpleSGDTrainer(self.model) + # self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9) + self.trainer = dy.AdamTrainer(self.model) + + self.trainer.set_sparse_updates(False) + self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=1, + embeddings_size=self.config.input_embeddings_size, + model=self.model, runtime=runtime) + + self.holistic_embeddings = self.model.add_lookup_parameters( + (len(self.encodings.word2int), self.config.input_embeddings_size)) + + self.lang_embeddings = self.model.add_lookup_parameters((num_languages, self.config.input_embeddings_size)) + self.padd_embeddings = self.model.add_lookup_parameters( + (2, self.config.input_embeddings_size * 2)) # START/STOP + + self.bdrnn_fw = [] + self.bdrnn_bw = [] + + rnn_input_size = self.config.input_embeddings_size * 2 + + index = 0 + aux_proj_input_size = 0 + for layer_size in self.config.layers: + if runtime: + self.bdrnn_fw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) + self.bdrnn_bw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) + else: + self.bdrnn_fw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) + self.bdrnn_bw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) + rnn_input_size = layer_size * 2 + self.config.input_embeddings_size + index += 1 + if index == self.config.aux_softmax_layer: + aux_proj_input_size = rnn_input_size + + proj_input_size = self.config.layers[-1] * 2 + self.config.input_embeddings_size + + self.proj_arc_w_head = self.model.add_parameters((self.config.arc_proj_size, proj_input_size)) + self.proj_arc_b_head = self.model.add_parameters((self.config.arc_proj_size)) + self.proj_arc_w_dep = self.model.add_parameters((self.config.arc_proj_size, proj_input_size)) + self.proj_arc_b_dep = self.model.add_parameters((self.config.arc_proj_size)) + self.proj_label_w_head = self.model.add_parameters((self.config.label_proj_size, proj_input_size)) + 
self.proj_label_b_head = self.model.add_parameters((self.config.label_proj_size)) + self.proj_label_w_dep = self.model.add_parameters((self.config.label_proj_size, proj_input_size)) + self.proj_label_b_dep = self.model.add_parameters((self.config.label_proj_size)) + + self.upos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) + self.xpos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) + self.attrs_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) + self.upos_proj_b = self.model.add_parameters((self.config.label_proj_size)) + self.xpos_proj_b = self.model.add_parameters((self.config.label_proj_size)) + self.attrs_proj_b = self.model.add_parameters((self.config.label_proj_size)) + + self.link_b = self.model.add_parameters((1, self.config.arc_proj_size)) + self.link_w = self.model.add_parameters((self.config.arc_proj_size, self.config.arc_proj_size)) + + self.label_ww = self.model.add_parameters((1, len(self.encodings.label2int))) + self.label_w = self.model.add_parameters( + (len(self.encodings.label2int), self.config.label_proj_size * 2 + self.config.input_embeddings_size)) + self.label_bb = self.model.add_parameters((len(self.encodings.label2int))) + + self.upos_softmax_w = self.model.add_parameters((len(self.encodings.upos2int), self.config.label_proj_size)) + self.xpos_softmax_w = self.model.add_parameters((len(self.encodings.xpos2int), self.config.label_proj_size)) + self.attrs_softmax_w = self.model.add_parameters( + (len(self.encodings.attrs2int), self.config.label_proj_size)) + + self.upos_softmax_b = self.model.add_parameters((len(self.encodings.upos2int))) + self.xpos_softmax_b = self.model.add_parameters((len(self.encodings.xpos2int))) + self.attrs_softmax_b = self.model.add_parameters((len(self.encodings.attrs2int))) + + self.aux_softmax_weight = aux_softmax_weight + self.batch_loss = [] + + def start_batch(self): + dy.renew_cg() + self.batch_loss = [] + + def end_batch(self): + if len(self.batch_loss) > 0: + loss = dy.esum(self.batch_loss) + loss_val = loss.value() + loss.backward() + self.trainer.update() + return loss_val + else: + return 0 + + def learn(self, seq, lang_id=0): + # remove compound words + tmp = [] + for ss in seq: + if not ss.is_compound_entry: + tmp.append(ss) + seq = tmp + arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq, lang_id, + runtime=False) + gold_heads = [entry.head for entry in seq] + gold_labels = [entry.label for entry in seq] + + softmax_labels = self._predict_label(gold_heads, proj_labels, lang_id, runtime=False) + + losses = [] + + for gold_head, gold_label, arc_probs, softmax_label, entry in zip(gold_heads, gold_labels, + arc_matrix[1:], + softmax_labels, seq): + label_index = self.encodings.label2int[gold_label] + losses.append(-dy.log(arc_probs[gold_head])) + losses.append(-dy.log(dy.pick(softmax_label, label_index))) + + for softmax_morph, entry in zip(softmax_morphology, seq): + loss_upos = -dy.log(dy.pick(softmax_morph[0], self.encodings.upos2int[entry.upos])) + losses.append(loss_upos * (self.aux_softmax_weight / 3)) + + if len( + self.encodings.xpos2int) > 1: # stability check (some languages are missing attributes or XPOS, resulting in numerical overflow during backpropagation + loss_xpos = -dy.log(dy.pick(softmax_morph[1], self.encodings.xpos2int[entry.xpos])) + losses.append(loss_xpos * (self.aux_softmax_weight / 3)) + + if len( + self.encodings.attrs2int) > 1: # stability check (some languages are 
missing attributes or XPOS, resulting in numerical overflow during backpropagation + loss_attrs = -dy.log(dy.pick(softmax_morph[2], self.encodings.attrs2int[entry.attrs])) + losses.append(loss_attrs * (self.aux_softmax_weight / 3)) + + loss = dy.esum(losses) / len(losses) + self.batch_loss.append(loss) + + def _attend(self, input_vectors, state, aux_embeddings): + w1 = self.lemma_att_w1.expr(update=True) + w2 = self.lemma_att_w2.expr(update=True) + v = self.lemma_att_v.expr(update=True) + attention_weights = [] + + w2dt = w2 * dy.concatenate([state.h()[-1], aux_embeddings]) + for input_vector in input_vectors: + attention_weight = v * dy.tanh(w1 * input_vector + w2dt) + attention_weights.append(attention_weight) + + attention_weights = dy.softmax(dy.concatenate(attention_weights)) + + output_vectors = dy.esum( + [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)]) + + return output_vectors + + def tag(self, seq, lang_id=0): + tmp = [] + for ss in seq: + if not ss.is_compound_entry: + tmp.append(ss) + + seq = tmp + + dy.renew_cg() + arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq, lang_id) + pred_heads = self.decoder.decode(arc_matrix) + softmax_labels = self._predict_label(pred_heads, proj_labels, lang_id) + + tag_list = [] + for pred_head, softmax_label in zip(pred_heads, softmax_labels): + label_index = np.argmax(softmax_label.npvalue()) + tag = ParserTag(pred_head, self.encodings.labels[label_index], None, None, None) + tag_list.append(tag) + + if self.config.predict_morphology: + for tag, softmax_morph in zip(tag_list, softmax_morphology): + tag.upos = self.encodings.upos_list[np.argmax(softmax_morph[0].npvalue())] + tag.xpos = self.encodings.xpos_list[np.argmax(softmax_morph[1].npvalue())] + tag.attrs = self.encodings.attrs_list[np.argmax(softmax_morph[2].npvalue())] + + return tag_list + + def _predict_label(self, heads, proj_labels, lang_id, runtime=True): + s_labels = [] + lang_emb = self.lang_embeddings[lang_id] + for iDep, iHead in zip(range(1, len(heads) + 1), heads): + modw = dy.transpose( + dy.reshape(proj_labels[iHead][1], (self.config.label_proj_size, 1)) * self.label_ww.expr(update=True)) + term1 = modw * proj_labels[iDep][0] + term2 = self.label_w.expr(update=True) * dy.concatenate( + [proj_labels[iHead][1], proj_labels[iDep][0], lang_emb]) + term3 = self.label_bb.expr(update=True) + s_labels.append(dy.softmax(term1 + term2 + term3)) + + return s_labels + + def _make_input(self, seq, lang_id, runtime): + x_list = [] + encoder_states_list = [None] + lang_emb = self.lang_embeddings[lang_id] + # add the root + + x_list.append(self.padd_embeddings[0]) + + for entry in seq: + word = entry.word + + # prepare lexical embeddings + char_emb, encoder_states = self.character_network.compute_embeddings(word, runtime=runtime, + language_embeddings=lang_emb) + encoder_states_list.append(encoder_states) + + word = word.lower() + + if word in self.encodings.word2int: + holistic_emb = self.holistic_embeddings[self.encodings.word2int[word]] + else: + holistic_emb = self.holistic_embeddings[self.encodings.word2int['']] + + # dropout lexical embeddings + if runtime: + w_emb = char_emb + holistic_emb + else: + p1 = random.random() + p2 = random.random() + + m1 = 1 + m2 = 1 + + if p1 < self.config.input_dropout_prob: + m1 = 0 + if p2 < self.config.input_dropout_prob: + m2 = 0 + scale = 1.0 + if m1 + m2 > 0: + scale = float(2) / (m1 + m2) + m1 = dy.scalarInput(m1) + m2 = dy.scalarInput(m2) + + scale = dy.scalarInput(scale) + w_emb 
= (char_emb * m1 + holistic_emb * m2) * scale + + x_list.append(dy.concatenate([w_emb, lang_emb])) + + # close sequence + x_list.append(self.padd_embeddings[1]) + + encoder_states_list.append(None) + return x_list, encoder_states_list + + def _predict_arc(self, seq, lang_id, runtime=True): + x_list, encoder_states_list = self._make_input(seq, lang_id, runtime) + lang_emb = self.lang_embeddings[lang_id] + # BDLSTM + rnn_outputs = [x_list] + for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts): + if runtime: + fw.set_dropouts(0, 0) + bw.set_dropouts(0, 0) + else: + fw.set_dropouts(dropout, dropout) + bw.set_dropouts(dropout, dropout) + + fw_list = fw.initial_state().transduce(x_list) + bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list)))) + x_list = [dy.concatenate([x_fw, x_bw, lang_emb]) for x_fw, x_bw in zip(fw_list, bw_list)] + + rnn_outputs.append(x_list) + + # projections + arc_projections = [[dy.tanh(self.proj_arc_w_dep.expr(update=True) * x + self.proj_arc_b_dep.expr(update=True)), + dy.tanh( + self.proj_arc_w_head.expr(update=True) * x + self.proj_arc_b_head.expr(update=True))] + for x in + rnn_outputs[-1]] + label_projections = [ + [dy.tanh(self.proj_label_w_dep.expr(update=True) * x + self.proj_label_b_dep.expr(update=True)), + dy.tanh(self.proj_label_w_head.expr(update=True) * x + self.proj_label_b_head.expr(update=True))] for x in + rnn_outputs[-1]] + if not runtime: + arc_projections = [ + [dy.dropout(x1, self.config.presoftmax_mlp_dropout), dy.dropout(x2, self.config.presoftmax_mlp_dropout)] + for x1, x2 in arc_projections] + label_projections = [ + [dy.dropout(x1, self.config.presoftmax_mlp_dropout), dy.dropout(x2, self.config.presoftmax_mlp_dropout)] + for x1, x2 in label_projections] + + drp = self.config.presoftmax_mlp_dropout + if runtime: + drp = 0 + upos_softmax = [dy.softmax(self.upos_softmax_w.expr(update=True) * dy.dropout(dy.tanh( + self.upos_proj_w.expr(update=True) * x + self.upos_proj_b.expr(update=True)), + drp) + self.upos_softmax_b.expr(update=True)) for x in + rnn_outputs[self.config.aux_softmax_layer]] + xpos_softmax = [dy.softmax(self.xpos_softmax_w.expr(update=True) * dy.dropout(dy.tanh( + self.xpos_proj_w.expr(update=True) * x + self.xpos_proj_b.expr(update=True)), + drp) + self.xpos_softmax_b.expr(update=True)) for x in + rnn_outputs[self.config.aux_softmax_layer]] + attrs_softmax = [dy.softmax(self.attrs_softmax_w.expr(update=True) * dy.dropout(dy.tanh( + self.attrs_proj_w.expr(update=True) * x + self.attrs_proj_b.expr(update=True)), + drp) + self.attrs_softmax_b.expr(update=True)) for x in + rnn_outputs[self.config.aux_softmax_layer]] + + morphology_softmax = [[upos, xpos, attrs] for + upos, xpos, attrs in + zip(upos_softmax, xpos_softmax, attrs_softmax)] + + n = len(seq) + 1 + arc_matrix = [[None] * n for _ in range(n)] + + for iDst in range(n): + term_bias = self.link_b.expr(update=True) * arc_projections[iDst][1] + term_weight = self.link_w.expr(update=True) * arc_projections[iDst][1] + + for iSrc in range(n): + if iSrc != iDst: + attention = dy.reshape(term_weight, (1, self.config.arc_proj_size)) * arc_projections[iSrc][ + 0] + term_bias + arc_matrix[iSrc][iDst] = attention + + # compute softmax for arcs + a_m = [[None] * n for _ in range(n)] + if not self.config.predict_morphology: + aux_a_m = [[None] * n for _ in range(n)] + + for iSrc in range(n): + s_max = [] + if not self.config.predict_morphology: + aux_s_max = [] + for iDst in range(n): + if iSrc != iDst: + 
s_max.append(arc_matrix[iSrc][iDst]) + + s_max = dy.softmax(dy.concatenate(s_max)) + + ofs = 0 + for iDst in range(n): + if iSrc == iDst: + ofs = -1 + else: + a_m[iSrc][iDst] = s_max[iDst + ofs] + + return a_m, label_projections, morphology_softmax[1:-1] + + def save(self, path): + self.model.save(path) + + def load(self, path): + self.model.populate(path) + + def parse_sequences(self, sequences): + new_sequences = [] + for sequence, lang_id in sequences: + new_sequence = copy.deepcopy(sequence) + predicted_tags = self.tag(new_sequence, lang_id=lang_id) + iOrig, iTags = 0, 0 + while iOrig < len(new_sequence): + while new_sequence[iOrig].is_compound_entry: + iOrig += 1 + new_sequence[iOrig].head = predicted_tags[iTags].head + new_sequence[iOrig].label = predicted_tags[iTags].label + if self.config.predict_morphology == True: + new_sequence[iOrig].upos = predicted_tags[iTags].upos + new_sequence[iOrig].xpos = predicted_tags[iTags].xpos + new_sequence[iOrig].attrs = predicted_tags[iTags].attrs + iTags += 1 + iOrig += 1 + + new_sequences.append(new_sequence) + return new_sequences + + +class ParserTag: + def __init__(self, head, label, upos=None, xpos=None, attrs=None, lemma=None): + self.head = head + self.label = label + self.upos = upos + self.xpos = xpos + self.attrs = attrs + self.lemma = lemma diff --git a/cube/generic_networks/taggers.py b/_cube/generic_networks/taggers.py similarity index 78% rename from cube/generic_networks/taggers.py rename to _cube/generic_networks/taggers.py index 116a15d4f..f528cd35e 100644 --- a/cube/generic_networks/taggers.py +++ b/_cube/generic_networks/taggers.py @@ -27,26 +27,27 @@ class BDRNNTagger: - def __init__(self, tagger_config, encodings, embeddings, aux_softmax_weight=0.2, runtime=False): + def __init__(self, tagger_config, encodings, aux_softmax_weight=0.2, runtime=False, num_languages=1): self.config = tagger_config self.encodings = encodings - self.embeddings = embeddings self.model = dy.Model() - self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, - beta_2=0.9) # dy.MomentumSGDTrainer(self.model) + self.trainer = dy.AdamTrainer(self.model) # dy.MomentumSGDTrainer(self.model) self.trainer.set_sparse_updates(False) - self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=1, - embeddings_size=self.embeddings.word_embeddings_size, - model=self.model, runtime=runtime) + self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=2, + embeddings_size=self.config.input_size, + model=self.model, runtime=runtime, + lang_embeddings_size=self.config.language_embedding_size) - self.unknown_word_embedding = self.model.add_lookup_parameters((1, self.embeddings.word_embeddings_size)) + self.unknown_word_embedding = self.model.add_lookup_parameters((1, self.config.input_size)) self.holistic_word_embedding = self.model.add_lookup_parameters( - (len(encodings.word2int), self.embeddings.word_embeddings_size)) + (len(encodings.word2int), self.config.input_size)) + self.language_embeddings = self.model.add_lookup_parameters( + (num_languages, self.config.language_embedding_size)) - self.char_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size)) - self.emb_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size)) - self.hol_proj_w = self.model.add_parameters((self.config.input_size, self.embeddings.word_embeddings_size)) + self.char_proj_w = self.model.add_parameters((self.config.input_size, self.config.input_size)) + 
self.hol_proj_w = self.model.add_parameters((self.config.input_size, self.config.input_size)) + self.lang_proj_w = self.model.add_parameters((self.config.input_size, self.config.language_embedding_size)) self.bdrnn_fw = [] self.bdrnn_bw = [] @@ -61,7 +62,7 @@ def __init__(self, tagger_config, encodings, embeddings, aux_softmax_weight=0.2, else: self.bdrnn_fw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) self.bdrnn_bw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) - rnn_input_size = layer_size * 2 + rnn_input_size = layer_size * 2 + self.config.language_embedding_size index += 1 if index == self.config.aux_softmax_layer: aux_softmax_input_size = rnn_input_size @@ -70,7 +71,7 @@ def __init__(self, tagger_config, encodings, embeddings, aux_softmax_weight=0.2, for _ in range(3): # upos, xpos and attrs mlp_w = [] mlp_b = [] - input_sz = self.config.layers[-1] * 2 + input_sz = self.config.layers[-1] * 2 + self.config.language_embedding_size for l_size in self.config.presoftmax_mlp_layers: mlp_w.append(self.model.add_parameters((l_size, input_sz))) mlp_b.append(self.model.add_parameters((l_size))) @@ -95,9 +96,9 @@ def __init__(self, tagger_config, encodings, embeddings, aux_softmax_weight=0.2, self.aux_softmax_weight = aux_softmax_weight self.losses = [] - def tag(self, seq): + def tag(self, seq, lang_id=0): dy.renew_cg() - softmax_list, aux_softmax_list = self._predict(seq) + softmax_list, aux_softmax_list = self._predict(seq, lang_id=lang_id) label_list = [] for softmax in softmax_list: label_list.append([self.encodings.upos_list[np.argmax(softmax[0].npvalue())], @@ -105,9 +106,9 @@ def tag(self, seq): self.encodings.attrs_list[np.argmax(softmax[2].npvalue())]]) return label_list - def learn(self, seq): + def learn(self, seq, lang_id=0): # dy.renew_cg() - softmax_list, aux_softmax_list = self._predict(seq, runtime=False) + softmax_list, aux_softmax_list = self._predict(seq, lang_id=lang_id, runtime=False) losses = [] for entry, softmax, aux_softmax in zip(seq, softmax_list, aux_softmax_list): upos_index = self.encodings.upos2int[entry.upos] @@ -142,22 +143,14 @@ def end_batch(self): self.trainer.update() return total_loss_val - def _predict(self, seq, runtime=True): + def _predict(self, seq, runtime=True, lang_id=0): softmax_list = [] aux_softmax_list = [] x_list = [] + lang_emb = self.language_embeddings[lang_id] for entry in seq: word = entry.word - char_emb, _ = self.character_network.compute_embeddings(word, runtime=runtime) - import sys - if sys.version_info[0] == 2: - word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8')) - else: - word_emb, found = self.embeddings.get_word_embeddings(word) - if not found: - word_emb = self.unknown_word_embedding[0] - else: - word_emb = dy.inputVector(word_emb) + char_emb, _ = self.character_network.compute_embeddings(word, runtime=runtime, language_embeddings=lang_emb) if sys.version_info[0] == 2: holistic_word = word.decode('utf-8').lower() else: @@ -166,20 +159,22 @@ def _predict(self, seq, runtime=True): hol_emb = self.holistic_word_embedding[self.encodings.word2int[holistic_word]] else: hol_emb = self.holistic_word_embedding[self.encodings.word2int['']] - proj_emb = self.emb_proj_w.expr(update=True) * word_emb proj_hol = self.hol_proj_w.expr(update=True) * hol_emb proj_char = self.char_proj_w.expr(update=True) * char_emb + proj_lang = self.lang_proj_w.expr(update=True) * lang_emb # x_list.append(dy.tanh(proj_char + proj_emb + proj_hol)) if runtime: - 
x_list.append(dy.tanh(proj_char + proj_emb + proj_hol)) + x_list.append(dy.tanh(proj_char + proj_hol + proj_lang)) else: p1 = random.random() p2 = random.random() p3 = random.random() + p4 = random.random() m1 = 1 m2 = 1 m3 = 1 + if p1 < self.config.input_dropout_prob: m1 = 0 if p2 < self.config.input_dropout_prob: @@ -194,7 +189,7 @@ def _predict(self, seq, runtime=True): m2 = dy.scalarInput(m2) m3 = dy.scalarInput(m3) scale = dy.scalarInput(scale) - x_list.append(dy.tanh((proj_char * m1 + proj_emb * m2 + proj_hol * m3) * scale)) + x_list.append(dy.tanh((proj_char * m1 + proj_hol * m2 + proj_lang * m3) * scale)) # BDLSTM rnn_outputs = [] @@ -207,7 +202,7 @@ def _predict(self, seq, runtime=True): bw.set_dropouts(0, 0) fw_list = fw.initial_state().transduce(x_list) bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list)))) - x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)] + x_list = [dy.concatenate([x_fw, x_bw, lang_emb]) for x_fw, x_bw in zip(fw_list, bw_list)] # if runtime: # x_out = x_list # else: @@ -230,14 +225,22 @@ def _predict(self, seq, runtime=True): mlp_output.append(pre_softmax) for softmax_inp, aux_softmax_inp in zip(mlp_output, rnn_outputs[self.config.aux_softmax_layer - 1]): - softmax_list.append([dy.softmax(self.softmax_upos_w.expr(update=True) * softmax_inp[0] + self.softmax_upos_b.expr(update=True)), - dy.softmax(self.softmax_xpos_w.expr(update=True) * softmax_inp[1] + self.softmax_xpos_b.expr(update=True)), - dy.softmax( - self.softmax_attrs_w.expr(update=True) * softmax_inp[2] + self.softmax_attrs_b.expr(update=True))]) + softmax_list.append([dy.softmax( + self.softmax_upos_w.expr(update=True) * softmax_inp[0] + self.softmax_upos_b.expr(update=True)), + dy.softmax( + self.softmax_xpos_w.expr(update=True) * softmax_inp[1] + self.softmax_xpos_b.expr( + update=True)), + dy.softmax( + self.softmax_attrs_w.expr(update=True) * softmax_inp[ + 2] + self.softmax_attrs_b.expr(update=True))]) aux_softmax_list.append( - [dy.softmax(self.aux_softmax_upos_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_upos_b.expr(update=True)), - dy.softmax(self.aux_softmax_xpos_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_xpos_b.expr(update=True)), - dy.softmax(self.aux_softmax_attrs_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_attrs_b.expr(update=True))]) + [dy.softmax(self.aux_softmax_upos_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_upos_b.expr( + update=True)), + dy.softmax(self.aux_softmax_xpos_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_xpos_b.expr( + update=True)), + dy.softmax( + self.aux_softmax_attrs_w.expr(update=True) * aux_softmax_inp + self.aux_softmax_attrs_b.expr( + update=True))]) return softmax_list, aux_softmax_list @@ -249,9 +252,9 @@ def load(self, path): def tag_sequences(self, sequences): new_sequences = [] - for sequence in sequences: + for sequence, lang_id in sequences: new_sequence = copy.deepcopy(sequence) - predicted_tags = self.tag(new_sequence) + predicted_tags = self.tag(new_sequence, lang_id=lang_id) for entryIndex, pred in enumerate(predicted_tags): new_sequence[entryIndex].upos = pred[0] new_sequence[entryIndex].xpos = pred[1] diff --git a/cube/generic_networks/token_expanders.py b/_cube/generic_networks/token_expanders.py similarity index 100% rename from cube/generic_networks/token_expanders.py rename to _cube/generic_networks/token_expanders.py diff --git a/cube/generic_networks/tokenizers.py b/_cube/generic_networks/tokenizers.py similarity index 66% rename 
from cube/generic_networks/tokenizers.py rename to _cube/generic_networks/tokenizers.py index 23b0dbd78..7d530b0b4 100644 --- a/cube/generic_networks/tokenizers.py +++ b/_cube/generic_networks/tokenizers.py @@ -26,6 +26,478 @@ from cube.misc.misc import get_eta, pretty_time, log_progress, line_count from cube.io_utils.conll import ConllEntry +from cube.generic_networks.crf import CRFDecoder + + +class DummyTokenizer: + def __init__(self, config, encodings, num_languages=1, dict=None, runtime=False): + self.model = dy.Model() + self.trainer = dy.AdamTrainer(self.model) + self.encodings = encodings + self.config = config + self.dict = dict + self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), config.char_emb_size)) + self.case_lookup = self.model.add_lookup_parameters((3, 32)) + self.lang_lookup = self.model.add_lookup_parameters((num_languages, config.lang_emb_size)) + + self.LAYER_SIZE = 300 + self.NUM_LAYERS = 15 + self.WINDOW_SIZE = 2 + inp_size = config.char_emb_size + config.lang_emb_size + 32 # char + language embeddings, plus 32 for the case embedding (mirrors _forward below) + if dict is not None: + inp_size += 1 + self._proj_w = self.model.add_parameters((self.LAYER_SIZE, inp_size)) + self._proj_b = self.model.add_parameters((self.LAYER_SIZE)) + + self._fw_gate_w = [] + self._fw_gate_b = [] + self._fw_act_w = [] + self._fw_act_b = [] + self._fw_skip_w = [] + self._fw_skip_b = [] + self._bw_gate_w = [] + self._bw_gate_b = [] + self._bw_act_w = [] + self._bw_act_b = [] + self._bw_skip_w = [] + self._bw_skip_b = [] + inp_size = self.LAYER_SIZE * (self.WINDOW_SIZE + 1) + for ii in range(self.NUM_LAYERS): + self._fw_gate_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._fw_gate_b.append(self.model.add_parameters((self.LAYER_SIZE))) + self._fw_act_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._fw_act_b.append(self.model.add_parameters((self.LAYER_SIZE))) + self._fw_skip_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._fw_skip_b.append(self.model.add_parameters((self.LAYER_SIZE))) + self._bw_gate_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._bw_gate_b.append(self.model.add_parameters((self.LAYER_SIZE))) + self._bw_act_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._bw_act_b.append(self.model.add_parameters((self.LAYER_SIZE))) + self._bw_skip_w.append(self.model.add_parameters((self.LAYER_SIZE, inp_size))) + self._bw_skip_b.append(self.model.add_parameters((self.LAYER_SIZE))) + inp_size = self.LAYER_SIZE * (self.WINDOW_SIZE + 1) + + self.label2int = {'B': 0, 'I': 1, 'E': 2, 'S': 3, 'X': 4, 'BM': 5, 'IM': 6, 'EM': 7, 'SM': 8, 'T': 9, 'U': 10, + 'UM': 11} + self.label_list = ['B', 'I', 'E', 'S', 'X', 'BM', 'IM', 'EM', 'SM', 'T', 'U', 'UM'] + # self.crf = CRFDecoder(self.model, self.LAYER_SIZE, 100, len(self.label_list)) + self.aux1_softmax_output_w = self.model.add_parameters((len(self.label_list), self.LAYER_SIZE)) + self.aux1_softmax_output_b = self.model.add_parameters((len(self.label_list))) + self.aux2_softmax_output_w = self.model.add_parameters((len(self.label_list), self.LAYER_SIZE)) + self.aux2_softmax_output_b = self.model.add_parameters((len(self.label_list))) + self.softmax_output_w = self.model.add_parameters((len(self.label_list), self.LAYER_SIZE * 2)) + self.softmax_output_b = self.model.add_parameters((len(self.label_list))) + + def _make_input(self, seqs): + chars = [] + tags = [] + + for seq in seqs: + for entry in seq: + for char_idx in range(len(entry.word)): + chars.append(entry.word[char_idx]) + if 
len(entry.word) == 1: + tags.append('S') + elif char_idx == 0: + tags.append('B') + elif char_idx == len(entry.word) - 1: + tags.append('E') + else: + tags.append('I') + if entry.is_compound_entry: + tags[-1] = tags[-1] + 'M' + + if "spaceafter=no" not in entry.space_after.lower(): + chars.append(' ') + delta = -1 + while tags[delta].startswith('X'): + delta -= 1 # it should never crash if dataset is ok + + if tags[delta].startswith('S'): + tags[delta] = 'T' + else: + append_m = '' + if tags[delta].endswith('M'): + append_m = 'M' + tags[delta] = 'U' + append_m + return chars, tags + + def learn(self, conll_sequences, lang_id=0): + dy.renew_cg() + chars, tags = self._make_input(conll_sequences) + + outputs, aux1, aux2 = self._forward(chars, lang_id=lang_id) + # crf_output = self.crf.tag(outputs) + tgt_tags = [self.label2int[tag] for tag in tags] + + # loss = self.crf.learn(outputs, tgt_tags) / len(tgt_tags) + # from ipdb import set_trace + # set_trace() + + loss_main = dy.esum([-dy.log(dy.pick(output, tgt)) for output, tgt in zip(outputs, tgt_tags)]) / len(tgt_tags) + loss_aux1 = dy.esum([-dy.log(dy.pick(output, tgt)) for output, tgt in zip(aux1, tgt_tags)]) / len(tgt_tags) + loss_aux2 = dy.esum([-dy.log(dy.pick(output, tgt)) for output, tgt in zip(aux2, tgt_tags)]) / len(tgt_tags) + + loss = loss_main + 0.5 * loss_aux1 + 0.5 * loss_aux2 + l_val = loss.value() * len(tgt_tags) + loss.backward() + self.trainer.update() + return l_val + + def _batch_forward(self, chars, lang_id, batch_size=1000): + tags = [] + num_batches = len(chars) // batch_size + if len(chars) % batch_size != 0: + num_batches += 1 + + for ii in range(num_batches): + start = ii * batch_size + stop = ii * batch_size + batch_size + if stop > len(chars): + stop = len(chars) + + cbs = stop - start + offset = 0 + if start > 0: + start -= self.WINDOW_SIZE * self.NUM_LAYERS + offset = self.WINDOW_SIZE * self.NUM_LAYERS + stop = min(len(chars), stop + self.WINDOW_SIZE * self.NUM_LAYERS) + + copy_chars = chars[start:stop] + dy.renew_cg() + # from ipdb import set_trace + # set_trace() + outputs, _, _ = self._forward(copy_chars, lang_id=lang_id, runtime=True) + outputs = outputs[offset:offset + cbs] + + new_tags = [np.argmax(pred.npvalue()) for pred in outputs] + # new_tags = self.crf.tag(outputs) + for tag in new_tags: + tags.append(self.label_list[tag]) + return tags + + def tokenize(self, raw_text, lang_id=0): + # make sequences of approx 2000-4000 chars + BATCH_SIZE = 1000 # len(raw_text) + start = 0 + seqs = [] + seq = [] + word_index = 1 + word = '' + + chars = [c for c in raw_text] + tags = self._batch_forward(chars, batch_size=BATCH_SIZE, lang_id=lang_id) + + for index, char, tag in zip(range(len(tags)), chars, tags): + if not tag.startswith('X'): + word += char + if not tag.startswith('B') and not tag.startswith('X') and not tag.startswith( + 'I') and word.strip() != '': + entry = ConllEntry(word_index, word, '_', '_', '_', '_', word_index - 1, '_', '_', '_') + seq.append(entry) + word_index += 1 + word = '' + if tag.startswith('T') or tag.startswith('U'): + seqs.append(seq) + seq = [] + word_index = 1 + start += len(chars) + + if word.strip() != '': + entry = ConllEntry(word_index, word, '_', '_', '_', '_', word_index - 1, '_', '_', '_') + seq.append(entry) + if len(seq) != 0: + seqs.append(seq) + + return seqs + + def _is_known_word(self, chars, index): + max_win_size = 10 + for zz in range(min(max_win_size, index + 1)): + word = ''.join(chars[index - zz:index + 1]) + if word in self.dict: + return dy.scalarInput(1) + 
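+ # e.g. with chars = ['c', 'a', 't'] and index = 2, the loop above checks 't', 'at' and 'cat' against self.dict and returns 1 on the first hit; falling through to the line below means no dictionary word ends at this character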
return dy.scalarInput(0) + + def _forward(self, chars, lang_id=0, runtime=True): + inp_size = self.config.char_emb_size + 32 + self.config.lang_emb_size + if self.dict is not None: + inp_size += 1 + inp = [dy.inputVector(np.zeros(inp_size)) for ii in + range(self.WINDOW_SIZE)] + + lang_emb = self.lang_lookup[lang_id] + + for index, char in zip(range(len(chars)), chars): + case_emb = self.case_lookup[0] + if char.lower() == char.upper(): + case_emb = self.case_lookup[1] + elif char.lower() != char: + case_emb = self.case_lookup[2] + if char.lower() in self.encodings.char2int: + char_emb = self.char_lookup[self.encodings.char2int[char.lower()]] + + else: + char_emb = self.char_lookup[self.encodings.char2int['<UNK>']] + + if self.dict is None: + inp.append(dy.concatenate([case_emb, char_emb, lang_emb])) + else: + inp.append(dy.concatenate([case_emb, char_emb, lang_emb, self._is_known_word(chars, index)])) + + for ii in range(self.WINDOW_SIZE + 1): + inp.append(dy.inputVector(np.zeros((inp_size)))) + + # outputs = [] + + input = [self._proj_w.expr(update=True) * x + self._proj_b.expr(update=True) for x in inp] + inp_fw = input + + skip_conn_fw = [[] for ii in range(len(chars))] + for idx, g_w, g_b, a_w, a_b, skip_w, skip_b in zip(range(len(self._fw_gate_w)), self._fw_gate_w, + self._fw_gate_b, + self._fw_act_w, self._fw_act_b, self._fw_skip_w, + self._fw_skip_b): + new_inp = [dy.inputVector(np.zeros(self.LAYER_SIZE)) for ii in + range(self.WINDOW_SIZE)] + + for ii in range(len(chars)): + hidden = dy.concatenate(inp_fw[ii:ii + self.WINDOW_SIZE + 1]) + skip_conn_fw[ii].append(skip_w.expr(update=True) * hidden + skip_b.expr(update=True)) + + act = dy.tanh(a_w.expr(update=True) * hidden + a_b.expr(update=True)) + gate = dy.logistic(g_w.expr(update=True) * hidden + g_b.expr(update=True)) + output = dy.cmult(act, gate) + dy.cmult(1.0 - gate, inp_fw[ii + self.WINDOW_SIZE]) + + if not runtime: + output = dy.dropout(output, 0.25) + new_inp.append(output) + + for ii in range(self.WINDOW_SIZE + 1): + new_inp.append(dy.inputVector(np.zeros((self.LAYER_SIZE)))) + inp_fw = new_inp + + inp_bw = input + + skip_conn_bw = [[] for ii in range(len(chars))] + for idx, g_w, g_b, a_w, a_b, skip_w, skip_b in zip(range(len(self._bw_gate_w)), self._bw_gate_w, + self._bw_gate_b, + self._bw_act_w, self._bw_act_b, self._bw_skip_w, + self._bw_skip_b): + new_inp = [dy.inputVector(np.zeros(self.LAYER_SIZE)) for ii in + range(self.WINDOW_SIZE)] + + for ii in range(len(chars)): + hidden = dy.concatenate(inp_bw[ii + self.WINDOW_SIZE: ii + self.WINDOW_SIZE * 2 + 1]) + skip_conn_bw[ii].append(skip_w.expr(update=True) * hidden + skip_b.expr(update=True)) + + act = dy.tanh(a_w.expr(update=True) * hidden + a_b.expr(update=True)) + gate = dy.logistic(g_w.expr(update=True) * hidden + g_b.expr(update=True)) + output = dy.cmult(act, gate) + dy.cmult(1.0 - gate, inp_bw[ii + self.WINDOW_SIZE]) + + if not runtime: + output = dy.dropout(output, 0.25) + new_inp.append(output) + + for ii in range(self.WINDOW_SIZE + 1): + new_inp.append(dy.inputVector(np.zeros((self.LAYER_SIZE)))) + inp_bw = new_inp + + outputs_aux1 = [ + dy.softmax( + self.aux1_softmax_output_w.expr(update=True) * dy.rectify( + dy.esum(res)) + self.aux1_softmax_output_b.expr( + update=True)) for hidden, res in zip(inp_fw[self.WINDOW_SIZE:], skip_conn_fw)] + outputs_aux2 = [ + dy.softmax( + self.aux2_softmax_output_w.expr(update=True) * dy.rectify( + dy.esum(res)) + self.aux2_softmax_output_b.expr( + update=True)) for hidden, res in zip(inp_bw[self.WINDOW_SIZE:], skip_conn_bw)] + + 
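+ # main head: for each character, sum its skip connections across all forward layers and across all backward layers, concatenate the two pooled vectors and apply rectify + softmax over the 12 boundary labels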
outputs = [ + dy.softmax( + self.softmax_output_w.expr(update=True) * dy.rectify( + dy.concatenate([dy.esum(resfw), dy.esum(resbw)])) + self.softmax_output_b.expr( + update=True)) for hidden, resfw, resbw in + zip(inp_fw[self.WINDOW_SIZE:], skip_conn_fw, skip_conn_bw)] + # outputs = inp[self.window_size:] + return outputs, outputs_aux1, outputs_aux2 + + def save(self, filename): + self.model.save(filename) + + def load(self, filename): + self.model.populate(filename) + + +class CRFTokenizer: + def __init__(self, config, encodings, num_languages=1, runtime=False): + self.model = dy.Model() + self.trainer = dy.AdamTrainer(self.model) + self.encodings = encodings + self.config = config + self.char_lookup = self.model.add_lookup_parameters((len(self.encodings.char2int), config.char_emb_size)) + self.case_lookup = self.model.add_lookup_parameters((3, 32)) + input_size = 32 + config.char_emb_size + + self.lang_lookup = self.model.add_lookup_parameters((num_languages, config.lang_emb_size)) + input_size += config.lang_emb_size + + label_list = ['B', 'I', 'E', 'S', 'X', 'BM', 'IM', 'EM', 'SM', 'T', 'U', 'UM'] + self.label_list = [] + self.label2int = {} + for upos in self.encodings.upos2int: + for label in label_list: + key = label + '_' + upos + self.label2int[key] = len(self.label2int) + self.label_list.append(key) + + # self.label2int = {'B': 0, 'I': 1, 'E': 2, 'S': 3, 'X': 4, 'BM': 5, 'IM': 6, 'EM': 7, 'SM': 8, 'T': 9, 'U': 10, + # 'UM': 11} + # self.label_list = ['B', 'I', 'E', 'S', 'X', 'BM', 'IM', 'EM', 'SM', 'T', 'U', 'UM'] + + self.lstm_fw = [] + self.lstm_bw = [] + + for layer_size in self.config.lstm_layers: + self.lstm_fw.append(dy.LSTMBuilder(1, input_size, layer_size, self.model)) + self.lstm_bw.append(dy.LSTMBuilder(1, input_size, layer_size, self.model)) + input_size = layer_size * 2 + self.config.lang_emb_size + + self.crf_decoder = CRFDecoder(self.model, input_size, 100, len(self.label2int)) + + def _make_input(self, seqs): + chars = [] + tags = [] + + for seq in seqs: + for entry in seq: + upos = '' + for char_idx in range(len(entry.word)): + chars.append(entry.word[char_idx]) + upos = entry.upos + if len(entry.word) == 1: + tags.append('S') + elif char_idx == 0: + tags.append('B') + elif char_idx == len(entry.word) - 1: + tags.append('E') + else: + tags.append('I') + if entry.is_compound_entry: + tags[-1] = tags[-1] + 'M' + + tags[-1] = tags[-1] + '_' + upos + if "spaceafter=no" not in entry.space_after.lower(): + chars.append(' ') + tags.append('X_' + upos) + delta = -1 + while tags[delta].startswith('X'): + delta -= 1 # it should never crash if dataset is ok + + if tags[delta].startswith('S'): + tags[delta] = 'T_' + upos + else: + append_m = '' + if tags[delta][1] == 'M': + append_m = 'M' + tags[delta] = 'U' + append_m + '_' + upos + return chars, tags + + def learn(self, conll_sequences, lang_id=0): + dy.renew_cg() + chars, tags = self._make_input(conll_sequences) + # for char, tag in zip(chars, tags): + # print(char + "-" + tag) + # + # for seq in conll_sequences: + # for entry in seq: + # print(entry.word + "\t" + entry.space_after) + # sys.exit(0) + embs = self._forward(chars, lang_id=lang_id) + tgt_tags = [self.label2int[tag] for tag in tags] + loss = self.crf_decoder.learn(embs, tgt_tags) + l_val = loss.value() + loss.backward() + self.trainer.update() + return l_val + + def tokenize(self, raw_text, lang_id=0): + # make sequences of approx 2000-4000 chars + BATCH_SIZE = 2000 + start = 0 + seqs = [] + seq = [] + word_index = 1 + word = '' + + while start < len(raw_text): 
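+ # process the text window by window: renewing the computation graph per slice keeps DyNet memory from growing with input length; words and sentences are rebuilt below from the predicted labels (X covers inter-word spaces, T/U mark the sentence-final token)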
+ dy.renew_cg() + stop = start + BATCH_SIZE + if len(raw_text) - stop < BATCH_SIZE: + stop = len(raw_text) + + chars = [c for c in raw_text[start:stop]] + embs = self._forward(chars, lang_id=lang_id) + tags = self.crf_decoder.tag(embs) + + tmp_tags = [self.label_list[tag] for tag in tags] + tags = tmp_tags + + for index, char, tag in zip(range(len(tags)), chars, tags): + if not tag.startswith('X'): + word += char + if not tag.startswith('B') and not tag.startswith('X') and not tag.startswith( + 'I') and word.strip() != '': + entry = ConllEntry(word_index, word, '_', '_', '_', '_', word_index - 1, '_', '_', '_') + seq.append(entry) + word_index += 1 + word = '' + if tag.startswith('T') or tag.startswith('U'): + seqs.append(seq) + seq = [] + word_index = 1 + start += len(chars) + + if word.strip() != '': + entry = ConllEntry(word_index, word, '_', '_', '_', '_', word_index - 1, '_', '_', '_') + seq.append(entry) + if len(seq) != 0: + seqs.append(seq) + + return seqs + + def _forward(self, chars, lang_id=0): + inp = [] + + lang_emb = self.lang_lookup[lang_id] + + for char in chars: + case_emb = self.case_lookup[0] + if char.lower() == char.upper(): + case_emb = self.case_lookup[1] + elif char.lower() != char: + case_emb = self.case_lookup[2] + if char.lower() in self.encodings.char2int: + char_emb = self.char_lookup[self.encodings.char2int[char.lower()]] + + else: + char_emb = self.char_lookup[self.encodings.char2int['<UNK>']] + + inp.append(dy.concatenate([case_emb, char_emb, lang_emb])) + + for l_fw, l_bw in zip(self.lstm_fw, self.lstm_bw): + x_fw = l_fw.initial_state().transduce(inp) + x_bw = l_bw.initial_state().transduce(reversed(inp)) + inp = [dy.concatenate([fw, bw, lang_emb]) for fw, bw in zip(x_fw, list(reversed(x_bw)))] + + return inp + + def save(self, filename): + self.model.save(filename) + + def load(self, filename): + self.model.populate(filename) class TieredTokenizer: @@ -213,9 +685,11 @@ def _predict_tok(self, seq, y_gold=None, runtime=False): for index in range(len(seq)): word += seq[index] aux_softmax_output_prev.append( - dy.softmax(self.TOK_softmax_prev_w.expr(update=True) * fw_out[index] + self.TOK_softmax_prev_b.expr(update=True))) + dy.softmax(self.TOK_softmax_prev_w.expr(update=True) * fw_out[index] + self.TOK_softmax_prev_b.expr( + update=True))) aux_softmax_output_peek.append( - dy.softmax(self.TOK_softmax_peek_w.expr(update=True) * bw_out[index] + self.TOK_softmax_peek_b.expr(update=True))) + dy.softmax(self.TOK_softmax_peek_w.expr(update=True) * bw_out[index] + self.TOK_softmax_peek_b.expr( + update=True))) word_state = word_is_unknown peek_emb, found = self.word_embeddings.get_word_embeddings(word.strip()) @@ -238,7 +712,8 @@ def _predict_tok(self, seq, y_gold=None, runtime=False): if not runtime: hidden = dy.dropout(hidden, dropout) - softmax_output.append(dy.softmax(self.TOK_softmax_w.expr(update=True) * hidden + self.TOK_softmax_b.expr(update=True))) + softmax_output.append( + dy.softmax(self.TOK_softmax_w.expr(update=True) * hidden + self.TOK_softmax_b.expr(update=True))) must_split = False if not runtime: if y_gold[index] == "S" or y_gold[index] == "SX": @@ -466,18 +941,21 @@ def _predict_ss(self, seq, runtime=True): peek_out = self.SS_peek_lstm.initial_state().transduce(reversed(peek_chars))[-1] aux_softmax_output_peek.append( - dy.softmax(self.SS_aux_softmax_peek_w.expr(update=True) * peek_out + self.SS_aux_softmax_peek_b.expr(update=True))) + dy.softmax(self.SS_aux_softmax_peek_w.expr(update=True) * peek_out + self.SS_aux_softmax_peek_b.expr( + update=True))) lstm_fw = 
lstm_fw.add_input(x_list[cIndex]) lstm_out = lstm_fw.output() aux_softmax_output_prev.append( - dy.softmax(self.SS_aux_softmax_prev_w.expr(update=True) * lstm_out + self.SS_aux_softmax_prev_b.expr(update=True))) + dy.softmax(self.SS_aux_softmax_prev_w.expr(update=True) * lstm_out + self.SS_aux_softmax_prev_b.expr( + update=True))) hidden = dy.concatenate([lstm_out, peek_out]) for w, b, dropout in zip(self.SS_mlp_w, self.SS_mlp_b, self.config.ss_mlp_dropouts): hidden = dy.tanh(w.expr(update=True) * hidden + b.expr(update=True)) if not runtime: hidden = dy.dropout(hidden, dropout) - softmax_output.append(dy.softmax(self.SS_mlp_softmax_w.expr(update=True) * hidden + self.SS_mlp_softmax_b.expr(update=True))) + softmax_output.append( + dy.softmax(self.SS_mlp_softmax_w.expr(update=True) * hidden + self.SS_mlp_softmax_b.expr(update=True))) return softmax_output, aux_softmax_output_peek, aux_softmax_output_prev @@ -752,9 +1230,12 @@ def _predict(self, X, y=None): # X is a list of symbols, y is a list of labels next_chars = next_chars[-1] # self._attend(next_chars, lstm1_forward) softmax_aux_peek.append( - dy.softmax(self.aux_softmax_char_peek_w.expr(update=True) * next_chars + self.aux_softmax_char_peek_b.expr(update=True))) + dy.softmax( + self.aux_softmax_char_peek_w.expr(update=True) * next_chars + self.aux_softmax_char_peek_b.expr( + update=True))) softmax_aux_hist.append(dy.softmax( - self.aux_softmax_char_hist_w.expr(update=True) * encoder_char_output + self.aux_softmax_char_hist_b.expr(update=True))) + self.aux_softmax_char_hist_w.expr( + update=True) * encoder_char_output + self.aux_softmax_char_hist_b.expr(update=True))) # dropout at feature-set level: # if runtime: @@ -764,7 +1245,8 @@ def _predict(self, X, y=None): # X is a list of symbols, y is a list of labels if not runtime: decoder_input = dy.dropout(decoder_input, self.config.dropout_rate) - decoder_hidden = dy.tanh(self.decoder_hiddenW.expr(update=True) * decoder_input + self.decoder_hiddenB.expr(update=True)) + decoder_hidden = dy.tanh( + self.decoder_hiddenW.expr(update=True) * decoder_input + self.decoder_hiddenB.expr(update=True)) if not runtime: decoder_hidden = dy.dropout(decoder_hidden, self.config.dropout_rate) diff --git a/cube/generic_networks/translators.py b/_cube/generic_networks/translators.py similarity index 100% rename from cube/generic_networks/translators.py rename to _cube/generic_networks/translators.py diff --git a/cube/generic_networks/utils.py b/_cube/generic_networks/utils.py similarity index 100% rename from cube/generic_networks/utils.py rename to _cube/generic_networks/utils.py diff --git a/cube/generic_networks/wrappers.py b/_cube/generic_networks/wrappers.py similarity index 100% rename from cube/generic_networks/wrappers.py rename to _cube/generic_networks/wrappers.py diff --git a/tests/__init__.py b/_cube/graph/__init__.py similarity index 100% rename from tests/__init__.py rename to _cube/graph/__init__.py diff --git a/cube/graph/decoders.py b/_cube/graph/decoders.py similarity index 70% rename from cube/graph/decoders.py rename to _cube/graph/decoders.py index deb2c43bb..b260d5131 100644 --- a/cube/graph/decoders.py +++ b/_cube/graph/decoders.py @@ -65,16 +65,22 @@ def _make_ordered_list(self, tree, nWords): lst[tail] = head return lst[1:] - def decode(self, norm_score): - nWords = len(norm_score) - g = [] - for iSrc in range(1, nWords): - for iDst in range(1, nWords): - if iDst != iSrc: - a = Arc(iSrc, norm_score[iSrc][iDst].value(), iDst) - g.append(a) - tree = self._greedy_tree(g) - best_tree = 
self._make_ordered_list(tree, nWords) - return best_tree - - + def decode(self, score, lens): + best_tree_list = [] + for ii in range(score.shape[0]): + # norm_score = score[ii, :lens[ii], :lens[ii]] + norm_score = np.zeros((lens[ii] + 1, lens[ii] + 1)) + for wii in range(lens[ii]): + for wjj in range(lens[ii] + 1): + norm_score[wii + 1, wjj] = score[ii, wii, wjj] + nWords = norm_score.shape[0] # len(norm_score) + g = [] + for iSrc in range(1, nWords): + for iDst in range(1, nWords): + if iDst != iSrc: + a = Arc(iSrc, norm_score[iSrc][iDst], iDst) + g.append(a) + tree = self._greedy_tree(g) + best_tree = self._make_ordered_list(tree, nWords) + best_tree_list.append(best_tree) + return best_tree_list diff --git a/examples/.KEEP b/_cube/io_utils/__init__.py similarity index 100% rename from examples/.KEEP rename to _cube/io_utils/__init__.py diff --git a/_cube/io_utils/config.py b/_cube/io_utils/config.py new file mode 100644 index 000000000..0f35fb1a4 --- /dev/null +++ b/_cube/io_utils/config.py @@ -0,0 +1,293 @@ +# +# Authors: Tiberiu Boros, Stefan Daniel Dumitrescu +# +# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import ast +from builtins import object, super +from cube.misc.misc import fopen +import collections + +if sys.version_info[0] == 2: + import ConfigParser +else: + import configparser + + +class Config(object): + """Generic base class that implements load/save utilities.""" + + def __init__(self): + """Call to set config object name.""" + self.__config__ = self.__class__.__name__ + + def _auto_cast(self, s): + """Autocasts string s to its original type.""" + try: + return ast.literal_eval(s) + except: + return s + + def save(self, filename): + """Save configuration to file.""" + sorted_dict = collections.OrderedDict(sorted(self.__dict__.items())) # sort dictionary + if sys.version_info[0] == 2: + config = ConfigParser.ConfigParser() + else: + config = configparser.ConfigParser() + config.add_section(self.__config__) # write header + if sys.version_info[0] == 2: + items = sorted_dict.iteritems() + else: + items = sorted_dict.items() + for k, v in items: # for python3 use .items() + if not k.startswith("_"): # write only non-private properties + if isinstance(v, float): # if we are dealing with a float + str_v = str(v) + if "e" not in str_v and "." 
not in str_v: # stop possible confusion with an int by appending a ".0" + v = str_v + ".0" + v = str(v) + config.set(self.__config__, k, v) + with fopen(filename, 'w') as cfgfile: + config.write(cfgfile) + + def load(self, filename): + """Load configuration from file.""" + if sys.version_info[0] == 2: + config = ConfigParser.ConfigParser() + else: + config = configparser.ConfigParser() + config.read(filename) + # check to see if the config file has the appropriate section + if not config.has_section(self.__config__): + sys.stderr.write( + "ERROR: File \"" + filename + "\" is not a valid configuration file for the selected task: Missing section [" + self.__config__ + "]!\n") + sys.exit(1) + for k, v in config.items(self.__config__): + self.__dict__[k] = self._auto_cast(v) + + +class TokenizerConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.lstm_layers = [200, 200] + self.lang_emb_size = 100 + self.char_emb_size = 100 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + self._valid = True + + +class TaggerConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.layers = [200, 200] + self.layer_dropouts = [0.5, 0.5] + self.aux_softmax_layer = 1 + self._valid = True + self.input_dropout_prob = 0.33 + self.presoftmax_mlp_layers = [500] + self.presoftmax_mlp_dropouts = [0.5] + self.input_size = 100 + self.language_embedding_size = 100 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + if verbose: + print("INPUT SIZE:", self.input_size) + print("LAYERS:", self.layers) + print("LAYER DROPOUTS:", self.layer_dropouts) + print("AUX SOFTMAX POSITION:", self.aux_softmax_layer) + print("INPUT DROPOUT PROB:", self.input_dropout_prob) + print("PRESOFTMAX MLP LAYERS:", self.presoftmax_mlp_layers) + print("PRESOFTMAX MLP DROPOUT:", self.presoftmax_mlp_dropouts) + + if self.aux_softmax_layer > len(self.layers) - 1 or self.aux_softmax_layer == 0: + print( + "Configuration error: aux softmax layer must be placed after the first layer and before the final one.") + self._valid = False + + +class ParserConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.layers = [300, 300, 200, 200, 200] + self.layer_dropouts = [0.33, 0.33, 0.33, 0.33, 0.33] + self.aux_softmax_layer = 2 + self._valid = True + self.input_dropout_prob = 0.33 + self.arc_proj_size = 100 + self.label_proj_size = 400 + self.presoftmax_mlp_dropout = 0.33 + self.predict_morphology = True + self.input_embeddings_size = 100 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. 
Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + if verbose: + print("LAYERS:", self.layers) + print("LAYER DROPOUTS:", self.layer_dropouts) + print("AUX SOFTMAX POSITION:", self.aux_softmax_layer) + print("INPUT DROPOUT PROB:", self.input_dropout_prob) + print("ARC PROJECTION SIZE:", self.arc_proj_size) + print("LABEL PROJECTION SIZE:", self.label_proj_size) + print("PRESOFTMAX MLP DROPOUT:", self.presoftmax_mlp_dropout) + print("JOINTLY PARSE AND PREDICT MORPHOLOGY:", self.predict_morphology) + print("USE MORPHOLOGY AS INPUT:", self.use_morphology) + print("INPUT EMBEDDINGS SIZE:", self.input_embeddings_size) + + if self.aux_softmax_layer > len(self.layers) - 1 or self.aux_softmax_layer == 0: + print( + "Configuration error: aux softmax layer must be placed after the first layer and before the final one.") + self._valid = False + + +class LemmatizerConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.rnn_size = 200 + self.rnn_layers = 2 + self.char_embeddings = 100 + self.char_rnn_size = 200 + self.char_rnn_layers = 2 + self.tag_embeddings_size = 100 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + +class NMTConfig(Config): + def __init__(self, filename=None): + super().__init__() + self.encoder_layers = [300, 300] + self.encoder_layer_dropouts = [0.33, 0.33] + self.decoder_layers = 2 + self.decoder_size = 300 + self.decoder_dropout = 0.33 + self.input_size = 100 + self.aux_we_layer_size = 100 + self.input_dropout_prob = 0.33 + + if filename is None: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + +class TieredTokenizerConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + # sentece splitting + self.ss_char_embeddings_size = 100 + self.ss_char_peek_count = 5 + self.ss_mlp_layers = [100] + self.ss_mlp_dropouts = [0.33] + self.ss_lstm_size = 64 + self.ss_lstm_layers = 1 + self.ss_lstm_dropout = 0.33 + self.ss_peek_lstm_size = 64 + self.ss_peek_lstm_layers = 1 + self.ss_peek_lstm_dropout = 0.33 + # tokenization + self.tok_char_embeddings_size = 100 + self.tok_word_embeddings_size = 100 + self.tok_mlp_layers = [100] + self.tok_mlp_dropouts = [0.33] + self.tok_char_lstm_layers = 2 + self.tok_char_lstm_size = 200 + self.tok_char_lstm_dropout = 0.33 + self.tok_word_lstm_layers = 2 + self.tok_word_lstm_size = 200 + self.tok_word_lstm_dropout = 0.33 + self.tok_char_peek_lstm_layers = 2 + self.tok_char_peek_lstm_size = 200 + self.tok_char_peek_lstm_dropout = 0.33 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + self._valid = True + + +class CompoundWordConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.character_embeddings_size = 100 + self.encoder_size = 200 + self.encoder_layers = 2 + self.decoder_size = 200 + self.decoder_layers = 2 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. 
Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + +class GDBConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.use_char_embeddings = True + self.char_rnn_layers = 2 + self.char_rnn_size = 100 + self.embeddings_size = 100 + self.arc_rnn_layers = [200, 200] + self.label_rnn_size = 100 + self.proj_size = 100 + + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) + + self._valid = True diff --git a/cube/io_utils/conll.py b/_cube/io_utils/conll.py similarity index 79% rename from cube/io_utils/conll.py rename to _cube/io_utils/conll.py index cfb74ce6c..f3e117530 100644 --- a/cube/io_utils/conll.py +++ b/_cube/io_utils/conll.py @@ -20,18 +20,32 @@ import io from cube.misc.misc import fopen + class Dataset: - def __init__(self, file=None): + def __init__(self, file=None, lang_id=0): if file is not None: sys.stdout.write("Reading " + file + "... ") sys.stdout.flush() with fopen(file, "r") as f: lines = f.readlines() - - self.sequences = self._make_sequences(lines) + + self.sequences = self._make_sequences(lines, lang_id=lang_id) sys.stdout.write("found " + str(len(self.sequences)) + " sequences\n") - def _make_sequences(self, lines): + self.sequences = [] + + def load_language(self, file, lang_id, ignore_compound=False): + sys.stdout.write("Reading " + file + "... ") + sys.stdout.flush() + with fopen(file, "r") as f: + lines = f.readlines() + + ns = self._make_sequences(lines, lang_id=lang_id, ignore_compound=ignore_compound) + for [seq, l_id] in ns: + self.sequences.append([seq, l_id]) + sys.stdout.write("found " + str(len(ns)) + " sequences\n") + + def _make_sequences(self, lines, lang_id=0, ignore_compound=False): sequences = [] in_sequence = False seq = [] @@ -42,12 +56,13 @@ def _make_sequences(self, lines): parts = line.split("\t") s = ConllEntry(parts[0], parts[1], parts[2], parts[3], parts[4], parts[5], parts[6], parts[7], parts[8], parts[9]) - seq.append(s) + if not ignore_compound or not s.is_compound_entry: + seq.append(s) in_sequence = True elif line == "": in_sequence = False if len(seq) > 0: - sequences.append(seq) + sequences.append([seq, lang_id]) seq = [] return sequences diff --git a/cube/io_utils/cupt.py b/_cube/io_utils/cupt.py similarity index 100% rename from cube/io_utils/cupt.py rename to _cube/io_utils/cupt.py diff --git a/cube/io_utils/embeddings.py b/_cube/io_utils/embeddings.py similarity index 100% rename from cube/io_utils/embeddings.py rename to _cube/io_utils/embeddings.py diff --git a/_cube/io_utils/encodings.py b/_cube/io_utils/encodings.py new file mode 100644 index 000000000..0dd4ec9ea --- /dev/null +++ b/_cube/io_utils/encodings.py @@ -0,0 +1,263 @@ +# +# Author: Tiberiu Boros +# +# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import re +from cube.misc.misc import fopen + + +class Encodings(object): + + def __init__(self, verbose=True): + self.word_list = {} + self.hol_word_list = [] + self.char2int = {} + self.label2int = {} + self.labels = [] + self.word2int = {} + self.upos2int = {} + self.xpos2int = {} + self.attrs2int = {} + self.upos_list = [] + self.xpos_list = [] + self.attrs_list = [] + self.characters = [] + self.verbose = verbose + + def compute(self, train, dev, tag_type=None, word_cutoff=7, char_cutoff=5, CUPT_format=False): + if self.verbose: + sys.stdout.write("Computing encoding maps... ") + sys.stdout.flush() + + self.word2int['<PAD>'] = 0 + self.hol_word_list.append('<PAD>') + self.word2int['<UNK>'] = 1 + self.hol_word_list.append('<UNK>') + self.char2int['<PAD>'] = 0 + self.char2int['<UNK>'] = 1 + self.char2int[' '] = 2 + self.upos2int['<PAD>'] = 0 + self.upos_list.append('<PAD>') + self.xpos2int['<PAD>'] = 0 + self.xpos_list.append('<PAD>') + self.attrs2int['<PAD>'] = 0 + self.attrs_list.append('<PAD>') + self.upos2int['<UNK>'] = 1 + self.upos_list.append('<UNK>') + self.xpos2int['<UNK>'] = 1 + self.xpos_list.append('<UNK>') + self.attrs2int['<UNK>'] = 1 + self.attrs_list.append('<UNK>') + self.label2int['<PAD>'] = 0 + self.labels.append('<PAD>') + self.label2int['<UNK>'] = 1 + self.labels.append('<UNK>') + + self.characters.append("<PAD>") + self.characters.append("<UNK>") + self.characters.append(" ") + char_count = {} + word_count = {} + for [seq, lang_id] in train.sequences: + for entry in seq: + word = entry.word.lower() + if word not in word_count: + word_count[word] = 1 + else: + word_count[word] = word_count[word] + 1 + if word not in self.word_list: + self.word_list[word] = 0 # word is inside trainset + + uniword = entry.word.lower() + uniword = re.sub('\d', '0', uniword) + for i in range(len(uniword)): + char = uniword[i].lower() + if char not in char_count: + char_count[char] = 1 + else: + char_count[char] = char_count[char] + 1 + + # if char not in self.char2int: + # self.char2int[char] = len(self.char2int) + + label = entry.label + + if CUPT_format and tag_type == 'label': + if entry.label != "*": + labels = entry.label.split(';') + entry_labels = [label.split(':')[1] for label in labels if ':' in label] + for entry_label in entry_labels: + self.label2int.setdefault(entry_label, len(self.label2int)) + else: + if label not in self.label2int: + self.label2int[label] = len(self.label2int) + self.labels.append(label) + + # morphological encodings + if entry.upos not in self.upos2int: + self.upos2int[entry.upos] = len(self.upos2int) + self.upos_list.append(entry.upos) + if entry.xpos not in self.xpos2int: + self.xpos2int[entry.xpos] = len(self.xpos2int) + self.xpos_list.append(entry.xpos) + if entry.attrs not in self.attrs2int: + self.attrs2int[entry.attrs] = len(self.attrs2int) + self.attrs_list.append(entry.attrs) + + for [seq, lang_id] in dev.sequences: + for entry in seq: + word = entry.word.lower() + if word not in self.word_list: + self.word_list[word] = 1 # word is inside devset only + + for word in word_count: + if word_count[word] >= word_cutoff: + self.word2int[word] = len(self.word2int) + self.hol_word_list.append(word) + for char in char_count: + if char_count[char] >= char_cutoff and char not in self.char2int: + self.char2int[char] = len(self.char2int) + self.characters.append(char) + + # force add digits + for digit in range(10): + ds = str(digit) + if ds not in self.char2int: + self.char2int[ds] = len(self.char2int) + self.characters.append(ds) + if self.verbose: + 
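+ # finishes the "Computing encoding maps... " progress line; the vocabulary counts below are printed unconditionally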
sys.stdout.write("done\n") + + print("Unique words: " + str(len(self.word_list))) + print("Unique chars: " + str(len(self.char2int))) + print("Unique labels: " + str(len(self.label2int))) + print("Unique UPOS: " + str(len(self.upos2int))) + print("Unique XPOS: " + str(len(self.xpos2int))) + print("Unique ATTRS: " + str(len(self.attrs2int))) + print("Holistic word count: " + str(len(self.word2int))) + + def update_wordlist(self, dataset): + for seq in dataset.sequences: + for entry in seq: + word = entry.word.lower() + if word not in self.word_list: + self.word_list[word] = 2 # word is inside an auxiliarly set (probably test) + + def load(self, filename): + # We only read character2int, labels, holistic words and label2int here. word_list should be recomputed for every dataset (if deemed necessary) + with fopen(filename, "r") as f: + line = f.readline() + + num_labels = int(line.split(" ")[1]) + if self.verbose: + print("Loading labels " + str(num_labels)) + self.labels = [""] * num_labels + for _ in range(num_labels): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.label2int[key] = value + self.labels[value] = key + + line = f.readline() + num_characters = int(line.split(" ")[1]) + self.characters = [""] * num_characters + if self.verbose: + print("Loading characters " + str(num_characters)) + for _ in range(num_characters): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.char2int[key] = value + self.characters[value] = key + line = f.readline() + num_words = int(line.split(" ")[1]) + if self.verbose: + print("Loading words " + str(num_words)) + for _x in range(num_words): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.word2int[key] = value + + # morphological attributes + line = f.readline() + num_labels = int(line.split(" ")[1]) + if self.verbose: + print("Loading upos " + str(num_labels)) + self.upos_list = [""] * num_labels + for _ in range(num_labels): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.upos2int[key] = value + self.upos_list[value] = key + + line = f.readline() + num_labels = int(line.split(" ")[1]) + self.xpos_list = [""] * num_labels + if self.verbose: + print("Loading xpos " + str(num_labels)) + for _ in range(num_labels): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.xpos2int[key] = value + self.xpos_list[value] = key + + line = f.readline() + num_labels = int(line.split(" ")[1]) + self.attrs_list = [""] * num_labels + if self.verbose: + print("Loading attrs " + str(num_labels)) + for _ in range(num_labels): + line = f.readline() + parts = line.split("\t") + key = parts[0] + value = int(parts[1]) + self.attrs2int[key] = value + self.attrs_list[value] = key + f.close() + + def save(self, filename): + f = fopen(filename, "w") + f.write("LABELS " + str(len(self.label2int)) + "\n") + for label in self.label2int: + f.write(str(label) + "\t" + str(self.label2int[label]) + "\n") + f.write("CHARACTERS " + str(len(self.char2int)) + "\n") + for character in self.char2int: + f.write(character + "\t" + str(self.char2int[character]) + "\n") + f.write("WORDS " + str(len(self.word2int)) + "\n") + for word in self.word2int: + f.write(word + "\t" + str(self.word2int[word]) + "\n") + + f.write("UPOS " + str(len(self.upos2int)) + "\n") + for label in self.upos2int: + f.write(label + "\t" + str(self.upos2int[label]) + "\n") + f.write("XPOS " + 
str(len(self.xpos2int)) + "\n") + for label in self.xpos2int: + f.write(label + "\t" + str(self.xpos2int[label]) + "\n") + f.write("ATTRS " + str(len(self.attrs2int)) + "\n") + for label in self.attrs2int: + f.write(label + "\t" + str(self.attrs2int[label]) + "\n") + f.close() diff --git a/cube/io_utils/model_store.py b/_cube/io_utils/model_store.py similarity index 100% rename from cube/io_utils/model_store.py rename to _cube/io_utils/model_store.py diff --git a/cube/io_utils/mt.py b/_cube/io_utils/mt.py similarity index 100% rename from cube/io_utils/mt.py rename to _cube/io_utils/mt.py diff --git a/cube/io_utils/trainers.py b/_cube/io_utils/trainers.py similarity index 86% rename from cube/io_utils/trainers.py rename to _cube/io_utils/trainers.py index 69b2ae5ec..aeaf7c72c 100644 --- a/cube/io_utils/trainers.py +++ b/_cube/io_utils/trainers.py @@ -17,7 +17,7 @@ # import sys -from misc.misc import fopen +from cube.misc.misc import fopen sys.path.insert(0, '../') from random import shuffle @@ -517,14 +517,15 @@ def start_training(self, output_base, batch_size=0): current_batch_size = 0 self.tagger.start_batch() for iSeq in range(len(self.trainset.sequences)): - seq = self.trainset.sequences[iSeq] + seq = self.trainset.sequences[iSeq][0] + lang_id = self.trainset.sequences[iSeq][1] proc = int((iSeq + 1) * 100 / len(self.trainset.sequences)) if proc % 5 == 0 and proc != last_proc: last_proc = proc sys.stdout.write(" " + str(proc)) sys.stdout.flush() - self.tagger.learn(seq) + self.tagger.learn(seq, lang_id=lang_id) current_batch_size += len(seq) if current_batch_size > batch_size: total_loss += self.tagger.end_batch() @@ -617,7 +618,8 @@ def eval(self, dataset): total = 0 for iSeq in range(len(dataset.sequences)): - seq = dataset.sequences[iSeq] + seq = dataset.sequences[iSeq][0] + lang_id = dataset.sequences[iSeq][1] proc = int((iSeq + 1) * 100 / len(dataset.sequences)) if proc % 5 == 0 and proc != last_proc: @@ -625,7 +627,7 @@ def eval(self, dataset): sys.stdout.write(" " + str(proc)) sys.stdout.flush() - pred_tags = self.tagger.tag(seq) + pred_tags = self.tagger.tag(seq, lang_id=lang_id) for entry, pred_tag in zip(seq, pred_tags): total += 1 @@ -660,8 +662,8 @@ def start_training(self, output_base, batch_size=100): path = output_base + ".conf" sys.stdout.write("Storing config in " + path + "\n") self.parser.config.save(path) - sys.stdout.write("\tevaluating on devset...") - sys.stdout.flush() + # sys.stdout.write("\tevaluating on devset...") + # sys.stdout.flush() # dev_uas, dev_las, dev_upos, dev_xpos, dev_attrs, dev_lemma = self.eval(self.devset) # sys.stdout.write(" UAS=" + str(dev_uas) + " LAS=" + str(dev_las) + " UPOS=" + str(dev_upos) + " XPOS=" + str( # dev_xpos) + " ATTRS=" + str(dev_attrs) + " LEMMA=" + str(dev_lemma) + "\n") @@ -706,14 +708,15 @@ def start_training(self, output_base, batch_size=100): start_time = time.time() for iSeq in range(len(self.trainset.sequences)): - seq = self.trainset.sequences[iSeq] + seq = self.trainset.sequences[iSeq][0] + lang_id = self.trainset.sequences[iSeq][1] proc = int((iSeq + 1) * 100 / len(self.trainset.sequences)) if proc % 5 == 0 and proc != last_proc: last_proc = proc sys.stdout.write(" " + str(proc)) sys.stdout.flush() - self.parser.learn(seq) + self.parser.learn(seq, lang_id) current_batch_size += len(seq) if current_batch_size >= batch_size: total_loss += self.parser.end_batch() @@ -795,7 +798,8 @@ def eval(self, dataset): total = 0 for iSeq in range(len(dataset.sequences)): - seq = dataset.sequences[iSeq] + seq = 
dataset.sequences[iSeq][0] + lang_id = dataset.sequences[iSeq][1] # remove compound words tmp = [] for entry in seq: @@ -808,7 +812,7 @@ def eval(self, dataset): sys.stdout.write(" " + str(proc)) sys.stdout.flush() - predicted = self.parser.tag(seq) + predicted = self.parser.tag(seq, lang_id) for entry, pred in zip(seq, predicted): total += 1 @@ -836,6 +840,193 @@ def eval(self, dataset): class TokenizerTrainer: + def __init__(self, tokenizer, encodings, patience, trainset, devset): + self.tokenizer = tokenizer + self.encodings = encodings + self.patience = patience + self.trainset = trainset + self.devset = devset + + self.train_buckets = {} + self.dev_buckets = {} + + self._bucket_languages() + + def _bucket_languages(self): + max_lang_id = 0 + for seq, lang_id in self.trainset.sequences: + if lang_id > max_lang_id: + max_lang_id = lang_id + + for lang_id in range(max_lang_id + 1): + self.train_buckets[lang_id] = [] + self.dev_buckets[lang_id] = [] + + for seq, lang_id in self.trainset.sequences: + self.train_buckets[lang_id].append(seq) + + for seq, lang_id in self.devset.sequences: + self.dev_buckets[lang_id].append(seq) + + def _make_input(self, seqs): + chars = [] + subtokens_left = 0 + for seq in seqs: + for entry in seq: + if subtokens_left == 0: + for char_idx in range(len(entry.word)): + chars.append(entry.word[char_idx]) + if "spaceafter=no" not in entry.space_after.lower(): + chars.append(' ') + else: + subtokens_left -= 1 + if entry.is_compound_entry: + parts = entry.index.split('-') + subtokens_left = int(parts[1]) - int(parts[0]) + 1 + return ''.join(chars) + + def eval(self, output_base): + # todo: implement multilanguage training + + pred_file = open(output_base + "-temporary.conllu", 'w') + gold_file = open(output_base + "-gold.conllu", 'w') + for lang_id in self.dev_buckets: + text = self._make_input(self.dev_buckets[lang_id]) + seqs = self.tokenizer.tokenize(text, lang_id=lang_id) + # with fopen(output_base + "-temporary.conllu", 'w') as file: + for sentence in seqs: + # print ("Sentence has entries: "+str(len(sentence))) + for entry in sentence: + line = str( + entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" + str( + entry.head) + "\t" + entry.label + "\t" + entry.deps + "\t" + entry.space_after + "\n" + pred_file.write(line) + + pred_file.write("\n") + + for sentence in self.dev_buckets[lang_id]: + # print ("Sentence has entries: "+str(len(sentence))) + for entry in sentence: + line = str( + entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" + str( + entry.head) + "\t" + entry.label + "\t" + entry.deps + "\t" + entry.space_after + "\n" + gold_file.write(line) + + gold_file.write("\n") + + # run eval script + + pred_file.close() + gold_file.close() + metrics = conll_eval(output_base + "-temporary.conllu", output_base + "-gold.conllu") + + # return metrics["Tokens"].f1 * 100., metrics["Sentences"].f1 * 100. 
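+ # conll_eval returns None when either CoNLL-U file cannot be parsed (see the wrapper's try/except), so report zero F1 rather than crashing mid-training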
+ if metrics is None: + return 0, 0, 0 + else: + return metrics["Sentences"].f1, metrics["Tokens"].f1, metrics["Words"].f1 + # return 0, 0, 0 + + def _get_num_chars(self, seq): + num_chars = 0 + for entry in seq: + num_chars += len(entry.word) + if "spaceafter=no" not in entry.space_after.lower(): + num_chars += 1 + return num_chars + + def start_training(self, output_base, batch_size=1000): + best_sent = 0 + best_token = 0 + best_word = 0 + patience_left = self.patience + total_loss = 0 + total_chars = 0 + path = output_base + ".encodings" + sys.stdout.write("Storing encodings in " + path + "\n") + self.encodings.save(path) + path = output_base + ".conf" + sys.stdout.write("Storing config in " + path + "\n") + self.tokenizer.config.save(path) + + # print(self.eval("tmp")) + + # todo: multilanguage training + epoch = 1 + while patience_left > 0: + sys.stdout.write('Starting epoch ' + str(epoch) + '\n') + epoch += 1 + patience_left -= 1 + last_proc = 0 + sys.stdout.write('\tShuffling training data\n') + for l_id in self.train_buckets: + random.shuffle(self.train_buckets[l_id]) + sys.stdout.write('\ttraining...') + sys.stdout.flush() + + chars_in_batch = 0 + batched_seqs = [] + start = time.time() + total_seqs = len(self.trainset.sequences) + curr_seqs = 0 + for lang_id in self.train_buckets: + for idx in range(len(self.train_buckets[lang_id])): + curr_seqs += 1 + curr_proc = curr_seqs * 100 // total_seqs + # print(curr_proc) + if curr_proc % 5 == 0 and last_proc != curr_proc: + while last_proc < curr_proc: + last_proc += 5 + sys.stdout.write(' ' + str(last_proc)) + sys.stdout.flush() + + batched_seqs.append(self.train_buckets[lang_id][idx]) + chars_in_batch += self._get_num_chars(self.train_buckets[lang_id][idx]) + if chars_in_batch > batch_size: + loss = self.tokenizer.learn(batched_seqs, lang_id=lang_id) + total_loss += loss + total_chars += chars_in_batch + chars_in_batch = 0 + batched_seqs = [] + + if chars_in_batch != 0: + loss = self.tokenizer.learn(batched_seqs, lang_id=lang_id) + total_loss += loss + total_chars += chars_in_batch + chars_in_batch = 0 + batched_seqs = [] + + stop = time.time() + + sys.stdout.write(' loss=' + str(total_loss / total_chars) + ' execution time=' + str(stop - start) + '\n') + + sys.stdout.write('\tevaluating...') + f_sent, f_token, f_word = self.eval(output_base) + sys.stdout.write(' sent=' + str(f_sent) + ' tok=' + str(f_token) + ' words=' + str(f_word) + '\n') + path = output_base + ".last" + sys.stdout.write('\t\tStoring ' + path + '\n') + self.tokenizer.save(path) + + if f_sent > best_sent: + best_sent = f_sent + patience_left = self.patience + sys.stdout.write('\t\tStoring ' + output_base + '-ss.best\n') + self.tokenizer.save(output_base + '-ss.best') + + if f_token > best_token: + best_token = f_token + patience_left = self.patience + sys.stdout.write('\t\tStoring ' + output_base + '-tok.best\n') + self.tokenizer.save(output_base + '-tok.best') + + if f_word > best_word: + best_word = f_word + patience_left = self.patience + sys.stdout.write('\t\tStoring ' + output_base + '-words.best\n') + self.tokenizer.save(output_base + '-words.best') + + +class TieredTokenizerTrainer: def __init__(self, tokenizer, encodings, patience, trainset, devset=None, testset=None, raw_train_file=None, raw_dev_file=None, raw_test_file=None, gold_train_file=None, gold_dev_file=None, gold_test_file=None): self.tokenizer = tokenizer @@ -970,7 +1161,7 @@ def start_training(self, output_base, batch_size=0): # convert Dataset to list of chars X_train_raw, y_train_raw, space_after_end_of_sentence = 
self._create_Xy_sequences(self.trainset) if not space_after_end_of_sentence: - print ("\t NOTE: Training sentences do not end with a space after EOS.") + print("\t NOTE: Training sentences do not end with a space after EOS.") # X_dev_raw, y_dev_raw, _ = self._create_Xy_sequences(self.devset) while itt_no_improve > 0: diff --git a/cube/main.py b/_cube/main.py similarity index 100% rename from cube/main.py rename to _cube/main.py diff --git a/_cube/misc/__init__.py b/_cube/misc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cube/misc/conll17_ud_eval.patch b/_cube/misc/conll17_ud_eval.patch similarity index 100% rename from cube/misc/conll17_ud_eval.patch rename to _cube/misc/conll17_ud_eval.patch diff --git a/cube/misc/conll17_ud_eval_wrapper.py b/_cube/misc/conll17_ud_eval_wrapper.py similarity index 100% rename from cube/misc/conll17_ud_eval_wrapper.py rename to _cube/misc/conll17_ud_eval_wrapper.py diff --git a/cube/misc/conll18_ud_eval.patch b/_cube/misc/conll18_ud_eval.patch similarity index 100% rename from cube/misc/conll18_ud_eval.patch rename to _cube/misc/conll18_ud_eval.patch diff --git a/cube/misc/conll18_ud_eval.py b/_cube/misc/conll18_ud_eval.py similarity index 100% rename from cube/misc/conll18_ud_eval.py rename to _cube/misc/conll18_ud_eval.py diff --git a/cube/misc/conll18_ud_eval.py.orig b/_cube/misc/conll18_ud_eval.py.orig similarity index 100% rename from cube/misc/conll18_ud_eval.py.orig rename to _cube/misc/conll18_ud_eval.py.orig diff --git a/cube/misc/conll18_ud_eval_wrapper.py b/_cube/misc/conll18_ud_eval_wrapper.py similarity index 61% rename from cube/misc/conll18_ud_eval_wrapper.py rename to _cube/misc/conll18_ud_eval_wrapper.py index 40e5f613c..9de05d51a 100644 --- a/cube/misc/conll18_ud_eval_wrapper.py +++ b/_cube/misc/conll18_ud_eval_wrapper.py @@ -16,13 +16,16 @@ # limitations under the License. # -from misc.conll18_ud_eval import load_conllu_file, evaluate +from _cube.misc.conll18_ud_eval import load_conllu_file, evaluate -#metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] -#example usage: metrics_test = conll_eval(system,gold) + +# metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] +# example usage: metrics_test = conll_eval(system,gold) # test_tok_f1, test_ss_f1 = metrics_test["Tokens"].f1*100., metrics_test["Sentences"].f1*100. -def conll_eval(system_file, gold_file): - gold_ud = load_conllu_file(gold_file) - system_ud = load_conllu_file(system_file) - return evaluate(gold_ud, system_ud) - \ No newline at end of file +def conll_eval(system_file, gold_file): + try: + gold_ud = load_conllu_file(gold_file) + system_ud = load_conllu_file(system_file) + return evaluate(gold_ud, system_ud) + except: + return None diff --git a/cube/misc/misc.py b/_cube/misc/misc.py similarity index 100% rename from cube/misc/misc.py rename to _cube/misc/misc.py diff --git a/_cube/webserver.py b/_cube/webserver.py new file mode 100644 index 000000000..424b71912 --- /dev/null +++ b/_cube/webserver.py @@ -0,0 +1,126 @@ +# +# Author: Tiberiu Boros +# +# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys + +sys.path.append("..") +import optparse +from flask import Flask +from flask import Response +from cube.api import Cube + +app = Flask(__name__) +singletonServer = False + +lang2cube = {} + + +@app.route('/') +def index(): + return "NLP-Cube server running. Use /help to learn more." + + +@app.route('/nlp', methods=['GET', 'POST']) +def nlp(): + from flask import request + if request.args.get('text'): + query = request.args.get('text') + else: + query = request.form.get('text') + + if request.args.get('lang'): + lang = request.args.get('lang') + else: + lang = request.form.get('lang') + + if request.args.get('format'): + format = request.args.get('format') + else: + format = request.form.get('format') + + if not format: + format = 'CONLL' + + if format != 'CONLL' and format != 'JSON': + return Response("Allowed values for 'format' are CONLL or JSON", mimetype='text/plain', + status=500) + if not query or not lang: + return Response("You need to specify the language (lang) and text (text) parameters", mimetype='text/plain', + status=500) + + if lang not in lang2cube: + return Response("This language has not beed preloaded during server startup", mimetype='text/plain', + status=500) + + thecube = lang2cube[lang] + result = thecube(query) + if format == 'CONLL': + text = "" + for seq in result: + for entry in seq: + text += str( + entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" + str( + entry.head) + "\t" + entry.label + "\t" + entry.deps + "\t" + entry.space_after + "\n" + text += "\n" + return Response(text, mimetype='text/plain', + status=200) + else: + import json + new_seqs = [] + for seq in result: + new_seq = [] + for entry in seq: + new_seq.append(entry.__dict__) + new_seqs.append(new_seq) + text = json.dumps(new_seqs, sort_keys=False, indent=4) + + return Response(text, mimetype='application/json', + status=200) + + +@app.route('/help', methods=['GET', 'POST']) +def help(): + text = "NLP-Cube server\n\n" \ + "Use /nlp endpoint to process any text.\n" \ + "\tParameters:\n" \ + "\t\t lang - language code\n" \ + "\t\t text - text to process\n" \ + "\t\t format - output format for data: CONLL|JSON (default is CONLL with plain/text output)" + return Response(text, mimetype='text/plain', + status=200) + + +if __name__ == '__main__': + parser = optparse.OptionParser() + parser.add_option('--port', action='store', dest='port', type='int', default=8080, + help='Binding port for web service (default: 8080)') + parser.add_option('--host', action='store', dest='host', default='0.0.0.0', + help='Binding IP for server (default: 0.0.0.0)') + parser.add_option('--lang', action='append', dest='languages', default=[], + help='Preload language. You can use this param multiple times: --lang en --lang fr ... 
(default is just ["en"])') + + (params, _) = parser.parse_args(sys.argv) + + if len(params.languages) == 0: + params.languages = ['en'] + + for lang in params.languages: + lang2cube[lang] = Cube(verbose=True) + lang2cube[lang].load(lang) + + app.run(port=params.port, host=params.host) diff --git a/cube/__init__.py b/cube/__init__.py index e69de29bb..7dcc765d6 100644 --- a/cube/__init__.py +++ b/cube/__init__.py @@ -0,0 +1,12 @@ +from cube.api import _load +from cube.version import __version__ + +import logging +logger = logging.getLogger('cube') + +if logger.level == 0: + logger.setLevel(logging.INFO) + +log_handler = logging.StreamHandler() +log_formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt='%Y-%m-%d %H:%M:%S') +log_handler.setFormatter(log_formatter) \ No newline at end of file diff --git a/cube/api.py b/cube/api.py index ff06e769b..f191929e6 100644 --- a/cube/api.py +++ b/cube/api.py @@ -1,240 +1,212 @@ -# -*- coding: utf-8 -*- - +# +# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import sys import os - -class Cube(object): - def __init__(self, verbose=False, random_seed=None, memory=512, autobatch=False, use_gpu=False): - """ - Create an empty instance for Cube - Before it can be used, you must call @method load with @param language_code set to your target language - """ - self._loaded = False - self._verbose = verbose - import dynet_config - - if random_seed != None: - if not isinstance(random_seed, int): - raise Exception ("Random seed must be an integer!") - if random_seed == 0: - print("[Warning] While Python and Numpy's seeds are now set to 0, DyNet uses 0 to reset the seed generator (fully random). 
Use any non-zero int value to set DyNet to a fixed random seed.") - # set python random seed - import random - random.seed(random_seed) - #set numpy random seed - import numpy as np - np.random.seed(random_seed) +import yaml +import string +import requests +import tarfile +from tqdm import tqdm + +sys.path.append('') +from typing import Optional, Union +from cube.io_utils.objects import Document, Word, Token, Sentence +from cube.io_utils.encodings import Encodings +from cube.io_utils.config import CompoundConfig, TokenizerConfig, ParserConfig, LemmatizerConfig +from cube.networks.compound import Compound +from cube.networks.parser import Parser +from cube.networks.tokenizer import Tokenizer +from cube.networks.lemmatizer import Lemmatizer +from pathlib import Path +from cube.networks.lm import LMHelperHF, LMHelperFT +from cube.networks.utils_tokenizer import TokenCollateHF, TokenCollateFTLanguasito +from cube.networks.utils import MorphoCollate, Word2TargetCollate + + +class CubeObj: + def __init__(self, model_base: str, device: str = 'cpu', lang: str = None): + self._cwe = None + # word expander + path = '{0}-trf-cwe'.format(model_base) + if os.path.exists('{0}.best'.format(path)): + config = CompoundConfig(filename='{0}.config'.format(path)) + encodings = Encodings() + encodings.load('{0}.encodings'.format(path)) + self._cwe = Compound(config, encodings) + self._cwe.load('{0}.best'.format(path)) + self._cwe.to(device) + + # tokenizer + path = '{0}-trf-tokenizer'.format(model_base) + g_conf = yaml.safe_load(open('{0}.yaml'.format(path))) + self._lang2id = {} + for lng in g_conf['language_codes']: + self._lang2id[lng] = len(self._lang2id) + self._default_lang_id = self._lang2id[g_conf['language_map'][lang]] + self._default_lang = lang + config = TokenizerConfig(filename='{0}.config'.format(path)) + print(self._default_lang_id) + lm_model = config.lm_model + encodings = Encodings() + encodings.load('{0}.encodings'.format(path)) + if lm_model.startswith('transformer'): + self._tokenizer_collate = TokenCollateHF(encodings, + lm_device=device, + lm_model=lm_model.split(':')[-1], + no_space_lang=config.no_space_lang, + lang_id=self._default_lang_id) else: - random_seed = 0 # this is the default value for DyNet (meaning full random) - - dynet_config.set(mem=memory, random_seed=random_seed, autobatch=autobatch) - if use_gpu: - dynet_config.set_gpu() - - def load(self, language_code, version="latest", local_models_repository=None, local_embeddings_file=None, tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True): - """ - Loads the pipeline with all available models for the target language. - - @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes - @param version: "latest" to get the latest version, or other specific version in like "1.0", "2.1", etc . 
- - """ - from .io_utils.encodings import Encodings - from .io_utils.embeddings import WordEmbeddings - from .io_utils.model_store import ModelMetadata, ModelStore - from .io_utils.config import TieredTokenizerConfig, CompoundWordConfig, LemmatizerConfig, TaggerConfig, ParserConfig - from .generic_networks.tokenizers import TieredTokenizer - from .generic_networks.token_expanders import CompoundWordExpander - from .generic_networks.lemmatizers import FSTLemmatizer - from .generic_networks.taggers import BDRNNTagger - from .generic_networks.parsers import BDRNNParser - - self._tokenizer = None # tokenizer object, default is None - self._compound_word_expander = None # compound word expander, default is None - self._lemmatizer = None # lemmatizer object, default is None - self._parser = None # parser object, default is None - self._tagger = None # tagger object, default is None - self.metadata = ModelMetadata() - - # Initialize a ModelStore object - if local_models_repository: - model_store_object = ModelStore(disk_path=local_models_repository) - else: - model_store_object = ModelStore() - - # Find a local model or download it if it does not exist, returning the local model folder path - model_folder_path = model_store_object.find(lang_code=language_code, version=version, verbose=self._verbose) - - # If the model contains metadata, load it - if os.path.isfile(os.path.join(model_folder_path, "metadata.json")): - self.metadata.read(os.path.join(model_folder_path, "metadata.json")) + self._tokenizer_collate = TokenCollateFTLanguasito(encodings, + lm_device=device, + lm_model=lm_model, + no_space_lang=config.no_space_lang, + lang_id=self._default_lang_id) + + + self._tokenizer = Tokenizer(config, encodings, language_codes=g_conf['language_codes'], + ext_word_emb=self._tokenizer_collate.get_embeddings_size()) + self._tokenizer.load('{0}.best'.format(path)) + self._tokenizer.to(device) + + # lemmatizer + path = '{0}-trf-lemmatizer'.format(model_base) + config = LemmatizerConfig(filename='{0}.config'.format(path)) + encodings = Encodings() + encodings.load('{0}.encodings'.format(path)) + self._lemmatizer = Lemmatizer(config, encodings) + self._lemmatizer.load('{0}.best'.format(path)) + self._lemmatizer.to(device) + self._lemmatizer_collate = Word2TargetCollate(encodings) + # parser-tagger + path = '{0}-trf-parser'.format(model_base) + config = ParserConfig(filename='{0}.config'.format(path)) + lm_model = config.lm_model + if lm_model.startswith('transformer'): + self._lm_helper = LMHelperHF(model=lm_model.split(':')[-1]) else: - self.metadata = None - - # Load embeddings - embeddings = WordEmbeddings(verbose=False) - if self._verbose: - sys.stdout.write('\tLoading embeddings ... 
\n') - if local_embeddings_file is not None: - embeddings.read_from_file(local_embeddings_file, None, full_load=False) - else: # embeddings file is not manually specified - if self.metadata is None: # no metadata exists - raise Exception("When using a locally-trained model please specify a path to a local embeddings file (local_embeddings_file cannot be None).") - else: # load from the metadata path - if self.metadata.embeddings_file_name is None or self.metadata.embeddings_file_name == "": - # load a dummy embedding - embeddings.load_dummy_embeddings() - else: - # load full embedding from file - emb_path = os.path.join(model_store_object.embeddings_repository, self.metadata.embeddings_file_name) - if not os.path.exists(emb_path): - raise Exception("Embeddings file not found: {}".format(emb_path)) - embeddings.read_from_file(emb_path, None, full_load=False) - - # 1. Load tokenizer - if tokenization: - if not os.path.isfile(os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')): - sys.stdout.write('\tTokenization is not available on this model. \n') - else: - if self._verbose: - sys.stdout.write('\tLoading tokenization model ...\n') - tokenizer_encodings = Encodings(verbose=False) - tokenizer_encodings.load(os.path.join(model_folder_path, 'tokenizer.encodings')) - config = TieredTokenizerConfig(os.path.join(model_folder_path, 'tokenizer.conf')) - self._tokenizer = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True) - self._tokenizer.load(os.path.join(model_folder_path, 'tokenizer')) - - # 2. Load compound - if compound_word_expanding: - if not os.path.isfile(os.path.join(model_folder_path, 'compound.bestAcc')): - if self._verbose: # supress warning here because many languages do not have compund words - sys.stdout.write('\tCompound word expansion is not available on this model. \n') - else: - if self._verbose: - sys.stdout.write('\tLoading compound word expander model ...\n') - compound_encodings = Encodings(verbose=False) - compound_encodings.load(os.path.join(model_folder_path, 'compound.encodings')) - config = CompoundWordConfig(os.path.join(model_folder_path, 'compound.conf')) - self._compound_word_expander = CompoundWordExpander(config, compound_encodings, embeddings, - runtime=True) - self._compound_word_expander.load(os.path.join(model_folder_path, 'compound.bestAcc')) - - # 3. Load lemmatizer - if lemmatization: - if not os.path.isfile(os.path.join(model_folder_path, 'lemmatizer.bestAcc')): - sys.stdout.write('\tLemmatizer is not available on this model. \n') - else: - if self._verbose: - sys.stdout.write('\tLoading lemmatization model ...\n') - lemmatizer_encodings = Encodings(verbose=False) - lemmatizer_encodings.load(os.path.join(model_folder_path, 'lemmatizer.encodings')) - config = LemmatizerConfig(os.path.join(model_folder_path, 'lemmatizer.conf')) - self._lemmatizer = FSTLemmatizer(config, lemmatizer_encodings, embeddings, runtime=True) - self._lemmatizer.load(os.path.join(model_folder_path, 'lemmatizer.bestAcc')) - - # 4. Load tagger - if tagging or lemmatization: # we need tagging for lemmatization - if not os.path.isfile(os.path.join(model_folder_path, 'tagger.bestUPOS')): - sys.stdout.write('\tTagging is not available on this model. \n') - if lemmatization: - sys.stdout.write('\t\tDisabling the lemmatization model due to missing tagger. 
\n') - self._lemmatizer = None - else: - if self._verbose: - sys.stdout.write('\tLoading tagger model ...\n') - tagger_encodings = Encodings(verbose=False) - tagger_encodings.load(os.path.join(model_folder_path, 'tagger.encodings')) - config = TaggerConfig(os.path.join(model_folder_path, 'tagger.conf')) - self._tagger = [None, None, None] - self._tagger[0] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) - self._tagger[0].load(os.path.join(model_folder_path, 'tagger.bestUPOS')) - self._tagger[1] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) - self._tagger[1].load(os.path.join(model_folder_path, 'tagger.bestXPOS')) - self._tagger[2] = BDRNNTagger(config, tagger_encodings, embeddings, runtime=True) - self._tagger[2].load(os.path.join(model_folder_path, 'tagger.bestATTRS')) - - # 5. Load parser - if parsing: - if not os.path.isfile(os.path.join(model_folder_path, 'parser.bestUAS')): - sys.stdout.write('\tParsing is not available on this model... \n') - else: - if self._verbose: - sys.stdout.write('\tLoading parser model ...\n') - parser_encodings = Encodings(verbose=False) - parser_encodings.load(os.path.join(model_folder_path, 'parser.encodings')) - config = ParserConfig(os.path.join(model_folder_path, 'parser.conf')) - self._parser = BDRNNParser(config, parser_encodings, embeddings, runtime=True) - self._parser.load(os.path.join(model_folder_path, 'parser.bestUAS')) - - self._loaded = True - if self._verbose: - sys.stdout.write('Model loading complete.\n\n') - - def __call__(self, text): - if not self._loaded: - raise Exception("Cube object is initialized but no model is loaded (eg.: call cube.load('en') )") - - sequences = [] - if self._tokenizer: - if not isinstance(text, str): - raise Exception("The text argument must be a string!") - # split text by lines - input_lines = text.split("\n") - for input_line in input_lines: - sequences+=self._tokenizer.tokenize(input_line) + self._lm_helper = LMHelperFT(model=lm_model.split(':')[-1]) + + encodings = Encodings() + encodings.load('{0}.encodings'.format(path)) + self._parser = Parser(config, encodings, language_codes=g_conf['language_codes'], + ext_word_emb=self._lm_helper.get_embedding_size()) + self._parser.load('{0}.best'.format(path)) + self._parser.to(device) + self._parser_collate = MorphoCollate(encodings) + + def __call__(self, text: Union[str, Document], flavour: Optional[str] = None): + lang_id = self._default_lang_id + if flavour is not None: + if flavour not in self._lang2id: + print("Unsupported language flavour") + print("Please choose from: {0}".format(' '.join([k for k in self._lang2id]))) + raise Exception("Unsupported language flavour\nPlease choose from: {0}". 
+ format(' '.join([k for k in self._lang2id]))) + lang_id = self._lang2id[flavour] + if isinstance(text, str): + doc = self._tokenizer.process(text, self._tokenizer_collate, lang_id=lang_id, num_workers=0) + if self._cwe is not None: + doc = self._cwe.process(doc, self._lemmatizer_collate, num_workers=0) else: - if not isinstance(text, list): - raise Exception("The text argument must be a list of lists of tokens!") - sequences = text # the input should already be tokenized - - if self._compound_word_expander: - sequences = self._compound_word_expander.expand_sequences(sequences) - - if self._parser: - sequences = self._parser.parse_sequences(sequences) - - if self._tagger or self._lemmatizer: - import copy - new_sequences = [] - for sequence in sequences: - new_sequence = copy.deepcopy(sequence) - predicted_tags_UPOS = self._tagger[0].tag(new_sequence) - predicted_tags_XPOS = self._tagger[1].tag(new_sequence) - predicted_tags_ATTRS = self._tagger[2].tag(new_sequence) - for entryIndex in range(len(new_sequence)): - new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0] - new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1] - new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2] - new_sequences.append(new_sequence) - sequences = new_sequences - - if self._lemmatizer: - sequences = self._lemmatizer.lemmatize_sequences(sequences) - - return sequences - - -if __name__ == "__main__": - cube = Cube(verbose=True) - cube.load('en', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True) - cube.metadata.info() - - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - - sentences = cube(text) - - for sentence in sentences: - print() - for token in sentence: - line = "" - line += str(token.index) + "\t" - line += token.word + "\t" - line += token.lemma + "\t" - line += token.upos + "\t" - line += token.xpos + "\t" - line += token.attrs + "\t" - line += str(token.head) + "\t" - line += token.label + "\t" - line += token.deps + "\t" - line += token.space_after - print(line) + doc = text + + self._lm_helper.apply(doc) + self._parser.process(doc, self._parser_collate, num_workers=0) + self._lemmatizer.process(doc, self._lemmatizer_collate, num_workers=0) + return doc + + +def _download_file(url: str, filename: str, description=None): + r = requests.get(url, stream=True) + if r.status_code != 200: + raise Exception(f"Error getting {url}, received status_code {r.status_code}") + file_size = int(r.headers['Content-Length']) + chunk_size = 1024 + + with open(filename, 'wb') as fp: + with tqdm(total=file_size, unit='B', unit_scale=True, desc=description, unit_divisor=1024, + disable=True if description is None else False, leave=False) as progressbar: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk is not None: + fp.write(chunk) + fp.flush() + progressbar.update(len(chunk)) + + return r.status_code + + +def _download_model(local_path, lang): + download_base = "https://github.com/adobe/NLP-Cube-Models/raw/3.0/models/{0}.tar.gz-a".format(lang) + file_base = "{0}.tar.gz-a".format(lang) + terminations = string.ascii_lowercase[:20] + file_list = [] + for t in terminations: + download_url = '{0}{1}'.format(download_base, t) + target_file = str(os.path.join(local_path, file_base)) + target_file = '{0}{1}'.format(target_file, t) + try: + if _download_file(download_url, target_file, description='Part {0}'.format(t)) != 200: 
+ break + except: + break + file_list.append(target_file) + + target_file = os.path.join(local_path, file_base[:-2]) + + f_out = open(target_file, 'wb') + for file in file_list: + f_in = open(file, 'rb') + while True: + buffer = f_in.read(1024 * 1024) + if not buffer: + break + f_out.write(buffer) + f_out.close() + + tar = tarfile.open(target_file, 'r:gz') + tar.extractall(local_path) + tar.close() + + +def _load(lang: str, device: Optional[str] = 'cpu') -> CubeObj: + try: + local_user_home = str(Path.home()) + local_user_storage = os.path.join(local_user_home, '.nlpcube', '3.0') + os.makedirs(local_user_storage, exist_ok=True) + lang_path = os.path.join(local_user_storage, lang) + if not os.path.exists(lang_path): + _download_model(local_user_storage, lang) + + return CubeObj('{0}/{1}'.format(lang_path, lang), device=device, lang=lang) + except: + raise Exception("There was a problem retrieving this language. Either it is unsupported or your Internet " + "connection is down.\n\nTo check for supported languages, visit " + "https://github.com/adobe/NLP-Cube/\n\nIt is hard to maintain models for all UD Treebanks. " + "This is way we are only including a handful of" + "languages with the official distribution. " + "However, we can include additional languages upon request" + "\n\nTo make a request for supporting a new language please create an issue on GitHub") + + +class Cube: + def __init__(self, verbose=False): + self._instance = None + + def load(self, lang: str, device: Optional[str] = 'cpu'): + self._instance = _load(lang, device) + + def __call__(self, text: Union[str, Document], flavour: Optional[str] = None): + return self._instance(text, flavour=flavour) diff --git a/cube/generic_networks/parsers.py b/cube/generic_networks/parsers.py deleted file mode 100644 index e82ab7c4e..000000000 --- a/cube/generic_networks/parsers.py +++ /dev/null @@ -1,521 +0,0 @@ -# -# Author: Tiberiu Boros -# -# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import numpy as np -import random -import dynet as dy -from cube.generic_networks.character_embeddings import CharacterNetwork -from cube.graph.decoders import GreedyDecoder -from cube.generic_networks.utils import orthonormal_VanillaLSTMBuilder -import copy -import sys - - -class BDRNNParser: - def __init__(self, parser_config, encodings, embeddings, aux_softmax_weight=0.2, runtime=False): - self.config = parser_config - self.encodings = encodings - self.embeddings = embeddings - self.decoder = GreedyDecoder() - - self.model = dy.Model() - - # self.trainer = dy.SimpleSGDTrainer(self.model) - self.trainer = dy.AdamTrainer(self.model, alpha=2e-3, beta_1=0.9, beta_2=0.9) - - self.trainer.set_sparse_updates(False) - self.character_network = CharacterNetwork(100, encodings, rnn_size=200, rnn_layers=1, - embeddings_size=self.config.input_embeddings_size, - model=self.model, runtime=runtime) - - self.holistic_embeddings = self.model.add_lookup_parameters( - (len(self.encodings.word2int), self.config.input_embeddings_size)) - - self.input_proj_w_word = self.model.add_parameters( - (self.config.input_embeddings_size, self.embeddings.word_embeddings_size)) - self.input_proj_b_word = self.model.add_parameters((self.config.input_embeddings_size)) - - self.unknown_word_embedding = self.model.add_lookup_parameters( - (3, self.config.input_embeddings_size)) # for padding lexical - self.pad_tag_embedding = self.model.add_lookup_parameters( - (3, self.config.input_embeddings_size)) # for padding morphology - - self.bdrnn_fw = [] - self.bdrnn_bw = [] - - rnn_input_size = 0 - if self.config.use_lexical: - rnn_input_size += self.config.input_embeddings_size - - if self.config.use_morphology: - rnn_input_size += self.config.input_embeddings_size - self.upos_lookup = self.model.add_lookup_parameters( - (len(self.encodings.upos2int), self.config.input_embeddings_size)) - self.xpos_lookup = self.model.add_lookup_parameters( - (len(self.encodings.xpos2int), self.config.input_embeddings_size)) - self.attrs_lookup = self.model.add_lookup_parameters( - (len(self.encodings.attrs2int), self.config.input_embeddings_size)) - - index = 0 - aux_proj_input_size = 0 - for layer_size in self.config.layers: - if runtime: - self.bdrnn_fw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) - self.bdrnn_bw.append(dy.VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) - else: - self.bdrnn_fw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) - self.bdrnn_bw.append(orthonormal_VanillaLSTMBuilder(1, rnn_input_size, layer_size, self.model)) - rnn_input_size = layer_size * 2 - index += 1 - if index == self.config.aux_softmax_layer: - aux_proj_input_size = rnn_input_size - - proj_input_size = self.config.layers[-1] * 2 - - self.proj_arc_w_head = self.model.add_parameters((self.config.arc_proj_size, proj_input_size)) - self.proj_arc_b_head = self.model.add_parameters((self.config.arc_proj_size)) - self.proj_arc_w_dep = self.model.add_parameters((self.config.arc_proj_size, proj_input_size)) - self.proj_arc_b_dep = self.model.add_parameters((self.config.arc_proj_size)) - self.proj_label_w_head = self.model.add_parameters((self.config.label_proj_size, proj_input_size)) - self.proj_label_b_head = self.model.add_parameters((self.config.label_proj_size)) - self.proj_label_w_dep = self.model.add_parameters((self.config.label_proj_size, proj_input_size)) - self.proj_label_b_dep = self.model.add_parameters((self.config.label_proj_size)) - if not self.config.predict_morphology: 
- self.aux_proj_arc_w_head = self.model.add_parameters((self.config.arc_proj_size, aux_proj_input_size)) - self.aux_proj_arc_b_head = self.model.add_parameters((self.config.arc_proj_size)) - self.aux_proj_arc_w_dep = self.model.add_parameters((self.config.arc_proj_size, aux_proj_input_size)) - self.aux_proj_arc_b_dep = self.model.add_parameters((self.config.arc_proj_size)) - else: - self.upos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) - self.xpos_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) - self.attrs_proj_w = self.model.add_parameters((self.config.label_proj_size, aux_proj_input_size)) - self.upos_proj_b = self.model.add_parameters((self.config.label_proj_size)) - self.xpos_proj_b = self.model.add_parameters((self.config.label_proj_size)) - self.attrs_proj_b = self.model.add_parameters((self.config.label_proj_size)) - - self.link_b = self.model.add_parameters((1, self.config.arc_proj_size)) - self.link_w = self.model.add_parameters((self.config.arc_proj_size, self.config.arc_proj_size)) - - self.label_ww = self.model.add_parameters((1, len(self.encodings.label2int))) - self.label_w = self.model.add_parameters((len(self.encodings.label2int), self.config.label_proj_size * 2)) - self.label_bb = self.model.add_parameters((len(self.encodings.label2int))) - - if not self.config.predict_morphology: - self.aux_link_w = self.model.add_parameters((self.config.arc_proj_size, self.config.arc_proj_size)) - self.aux_link_b = self.model.add_parameters((1, self.config.arc_proj_size)) - else: - self.upos_softmax_w = self.model.add_parameters((len(self.encodings.upos2int), self.config.label_proj_size)) - self.xpos_softmax_w = self.model.add_parameters((len(self.encodings.xpos2int), self.config.label_proj_size)) - self.attrs_softmax_w = self.model.add_parameters( - (len(self.encodings.attrs2int), self.config.label_proj_size)) - - self.upos_softmax_b = self.model.add_parameters((len(self.encodings.upos2int))) - self.xpos_softmax_b = self.model.add_parameters((len(self.encodings.xpos2int))) - self.attrs_softmax_b = self.model.add_parameters((len(self.encodings.attrs2int))) - self.lemma_softmax_b = self.model.add_parameters((len(self.encodings.char2int) + 1)) - self.lemma_softmax_casing_b = self.model.add_parameters((2)) - - self.aux_softmax_weight = aux_softmax_weight - self.batch_loss = [] - - def start_batch(self): - dy.renew_cg() - self.batch_loss = [] - - def end_batch(self): - if len(self.batch_loss) > 0: - loss = dy.esum(self.batch_loss) - loss_val = loss.value() - loss.backward() - self.trainer.update() - return loss_val - else: - return 0 - - def learn(self, seq): - # remove compound words - tmp = [] - for ss in seq: - if not ss.is_compound_entry: - tmp.append(ss) - seq = tmp - arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq, runtime=False) - gold_heads = [entry.head for entry in seq] - gold_labels = [entry.label for entry in seq] - - softmax_labels = self._predict_label(gold_heads, proj_labels, runtime=False) - - losses = [] - - for gold_head, gold_label, arc_probs, softmax_label, entry in zip(gold_heads, gold_labels, - arc_matrix[1:], - softmax_labels, seq): - label_index = self.encodings.label2int[gold_label] - losses.append(-dy.log(arc_probs[gold_head])) - losses.append(-dy.log(dy.pick(softmax_label, label_index))) - - if not self.config.predict_morphology: - for gold_head, aux_probs, entry in zip(gold_heads, aux_arc_matrix[ - 1:], seq): - 
losses.append(-dy.log(aux_probs[gold_head]) * self.aux_softmax_weight) - - else: - for softmax_morph, entry in zip(softmax_morphology, seq): - loss_upos = -dy.log(dy.pick(softmax_morph[0], self.encodings.upos2int[entry.upos])) - losses.append(loss_upos * (self.aux_softmax_weight / 3)) - - if len( - self.encodings.xpos2int) > 1: # stability check (some languages are missing attributes or XPOS, resulting in numerical overflow during backpropagation - loss_xpos = -dy.log(dy.pick(softmax_morph[1], self.encodings.xpos2int[entry.xpos])) - losses.append(loss_xpos * (self.aux_softmax_weight / 3)) - - if len( - self.encodings.attrs2int) > 1: # stability check (some languages are missing attributes or XPOS, resulting in numerical overflow during backpropagation - loss_attrs = -dy.log(dy.pick(softmax_morph[2], self.encodings.attrs2int[entry.attrs])) - losses.append(loss_attrs * (self.aux_softmax_weight / 3)) - - loss = dy.esum(losses) - self.batch_loss.append(loss) - - def _attend(self, input_vectors, state, aux_embeddings): - w1 = self.lemma_att_w1.expr(update=True) - w2 = self.lemma_att_w2.expr(update=True) - v = self.lemma_att_v.expr(update=True) - attention_weights = [] - - w2dt = w2 * dy.concatenate([state.h()[-1], aux_embeddings]) - for input_vector in input_vectors: - attention_weight = v * dy.tanh(w1 * input_vector + w2dt) - attention_weights.append(attention_weight) - - attention_weights = dy.softmax(dy.concatenate(attention_weights)) - - output_vectors = dy.esum( - [vector * attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)]) - - return output_vectors - - def tag(self, seq): - tmp = [] - for ss in seq: - if not ss.is_compound_entry: - tmp.append(ss) - - # if len(tmp)<2: - # print "ERRRORR" - # for entry in seq: - # print str(entry.index)+"\t"+str(entry.word) - seq = tmp - - dy.renew_cg() - arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq) - pred_heads = self.decoder.decode(arc_matrix) - softmax_labels = self._predict_label(pred_heads, proj_labels) - - tag_list = [] - for pred_head, softmax_label in zip(pred_heads, softmax_labels): - label_index = np.argmax(softmax_label.npvalue()) - tag = ParserTag(pred_head, self.encodings.labels[label_index], None, None, None) - tag_list.append(tag) - - if self.config.predict_morphology: - for tag, softmax_morph in zip(tag_list, softmax_morphology): - tag.upos = self.encodings.upos_list[np.argmax(softmax_morph[0].npvalue())] - tag.xpos = self.encodings.xpos_list[np.argmax(softmax_morph[1].npvalue())] - tag.attrs = self.encodings.attrs_list[np.argmax(softmax_morph[2].npvalue())] - - return tag_list - - def _predict_label(self, heads, proj_labels, runtime=True): - s_labels = [] - for iDep, iHead in zip(range(1, len(heads) + 1), heads): - modw = dy.transpose( - dy.reshape(proj_labels[iHead][1], (self.config.label_proj_size, 1)) * self.label_ww.expr(update=True)) - term1 = modw * proj_labels[iDep][0] - term2 = self.label_w.expr(update=True) * dy.concatenate([proj_labels[iHead][1], proj_labels[iDep][0]]) - term3 = self.label_bb.expr(update=True) - s_labels.append(dy.softmax(term1 + term2 + term3)) - - return s_labels - - def _make_input(self, seq, runtime): - x_list = [] - encoder_states_list = [None] - # add the root - if not self.config.use_morphology: - x_list.append(self.unknown_word_embedding[1]) - elif not self.config.use_lexical: - x_list.append(self.pad_tag_embedding[1]) - else: # both lexical and morphology are used - x_list.append(dy.concatenate( - 
[self.unknown_word_embedding[1], self.pad_tag_embedding[1]])) - - for entry in seq: - word = entry.word - - if self.config.use_lexical: - # prepare lexical embeddings - char_emb, encoder_states = self.character_network.compute_embeddings(word, runtime=runtime) - encoder_states_list.append(encoder_states) - if sys.version_info[0] == 2: - word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8')) - else: - word_emb, found = self.embeddings.get_word_embeddings(word) - if not found: - word_emb = self.unknown_word_embedding[0] - else: - word_emb = dy.tanh( - self.input_proj_w_word.expr(update=True) * dy.inputVector(word_emb) + self.input_proj_b_word.expr(update=True)) - if sys.version_info[0] == 2: - word = word.decode('utf-8').lower() - else: - word = word.lower() - - if word in self.encodings.word2int: - holistic_emb = self.holistic_embeddings[self.encodings.word2int[word]] - else: - holistic_emb = self.holistic_embeddings[self.encodings.word2int['']] - - # dropout lexical embeddings - if runtime: - w_emb = word_emb + char_emb + holistic_emb - else: - p1 = random.random() - p2 = random.random() - p3 = random.random() - m1 = 1 - m2 = 1 - m3 = 1 - if p1 < self.config.input_dropout_prob: - m1 = 0 - if p2 < self.config.input_dropout_prob: - m2 = 0 - if p3 < self.config.input_dropout_prob: - m3 = 0 - - scale = 1.0 - if m1 + m2 + m3 > 0: - scale = float(3) / (m1 + m2 + m3) - m1 = dy.scalarInput(m1) - m2 = dy.scalarInput(m2) - m3 = dy.scalarInput(m3) - scale = dy.scalarInput(scale) - w_emb = (word_emb * m1 + char_emb * m2 + holistic_emb * m3) * scale - - if self.config.use_morphology: - if entry.upos in self.encodings.upos2int: - upos_emb = self.upos_lookup[self.encodings.upos2int[entry.upos]] - else: - upos_emb = dy.inputVector([0] * self.config.input_embeddings_size) - if entry.xpos in self.encodings.xpos2int: - xpos_emb = self.xpos_lookup[self.encodings.xpos2int[entry.xpos]] - else: - xpos_emb = dy.inputVector([0] * self.config.input_embeddings_size) - if entry.attrs in self.encodings.attrs2int: - attrs_emb = self.attrs_lookup[self.encodings.attrs2int[entry.attrs]] - else: - attrs_emb = dy.inputVector([0] * self.config.input_embeddings_size) - # overwrite all dropouts. 
it will later be handled by "same-mask" - t_emb = upos_emb + xpos_emb + attrs_emb - # w_emb = word_emb + char_emb + holistic_emb - - # compose embeddings, if necessary - if self.config.use_lexical and self.config.use_morphology: - if not runtime: - p1 = random.random() - p2 = random.random() - m1 = 1 - m2 = 1 - if p1 < self.config.input_dropout_prob: - m1 = 0 - if p2 < self.config.input_dropout_prob: - m2 = 0 - if m1 + m2 > 0: - scale = float(2.0) / (m1 + m2) - else: - scale = 1.0 - scale = dy.scalarInput(scale) - m1 = dy.scalarInput(m1) - m2 = dy.scalarInput(m2) - x_list.append(dy.concatenate([w_emb * m1 * scale, t_emb * m2 * scale])) - else: - x_list.append(dy.concatenate([w_emb, t_emb])) - elif self.config.use_lexical: # just use_lexical == True - x_list.append(w_emb) - else: # just use_morphology == True - x_list.append(t_emb) - - # close sequence - if not self.config.use_morphology: - x_list.append(self.unknown_word_embedding[2]) - elif not self.config.use_lexical: - x_list.append(self.pad_tag_embedding[2]) - else: - x_list.append( - dy.concatenate( - [self.unknown_word_embedding[2], self.pad_tag_embedding[2]])) - - encoder_states_list.append(None) - return x_list, encoder_states_list - - def _predict_arc(self, seq, runtime=True): - x_list, encoder_states_list = self._make_input(seq, runtime) - - # BDLSTM - rnn_outputs = [x_list] - for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts): - if runtime: - fw.set_dropouts(0, 0) - bw.set_dropouts(0, 0) - else: - fw.set_dropouts(dropout, dropout) - bw.set_dropouts(dropout, dropout) - - fw_list = fw.initial_state().transduce(x_list) - bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list)))) - x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)] - - rnn_outputs.append(x_list) - - # projections - arc_projections = [[dy.tanh(self.proj_arc_w_dep.expr(update=True) * x + self.proj_arc_b_dep.expr(update=True)), - dy.tanh(self.proj_arc_w_head.expr(update=True) * x + self.proj_arc_b_head.expr(update=True))] for x in - rnn_outputs[-1]] - label_projections = [[dy.tanh(self.proj_label_w_dep.expr(update=True) * x + self.proj_label_b_dep.expr(update=True)), - dy.tanh(self.proj_label_w_head.expr(update=True) * x + self.proj_label_b_head.expr(update=True))] for x in - rnn_outputs[-1]] - if not runtime: - arc_projections = [ - [dy.dropout(x1, self.config.presoftmax_mlp_dropout), dy.dropout(x2, self.config.presoftmax_mlp_dropout)] - for x1, x2 in arc_projections] - label_projections = [ - [dy.dropout(x1, self.config.presoftmax_mlp_dropout), dy.dropout(x2, self.config.presoftmax_mlp_dropout)] - for x1, x2 in label_projections] - if not self.config.predict_morphology: - aux_arc_projections = [[dy.tanh(self.aux_proj_arc_w_dep.expr(update=True) * x + self.aux_proj_arc_b_dep.expr(update=True)), - dy.tanh(self.aux_proj_arc_w_head.expr(update=True) * x + self.aux_proj_arc_b_head.expr(update=True))] - for x in rnn_outputs[self.config.aux_softmax_layer]] - if not runtime: - aux_arc_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout), - dy.dropout(x2, self.config.presoftmax_mlp_dropout)] for x1, x2 in - aux_arc_projections] - - else: - drp = self.config.presoftmax_mlp_dropout - if runtime: - drp = 0 - upos_softmax = [dy.softmax(self.upos_softmax_w.expr(update=True) * dy.dropout(dy.tanh( - self.upos_proj_w.expr(update=True) * x + self.upos_proj_b.expr(update=True)), drp) + self.upos_softmax_b.expr(update=True)) for x in - rnn_outputs[self.config.aux_softmax_layer]] - xpos_softmax = 
[dy.softmax(self.xpos_softmax_w.expr(update=True) * dy.dropout(dy.tanh( - self.xpos_proj_w.expr(update=True) * x + self.xpos_proj_b.expr(update=True)), drp) + self.xpos_softmax_b.expr(update=True)) for x in - rnn_outputs[self.config.aux_softmax_layer]] - attrs_softmax = [dy.softmax(self.attrs_softmax_w.expr(update=True) * dy.dropout(dy.tanh( - self.attrs_proj_w.expr(update=True) * x + self.attrs_proj_b.expr(update=True)), drp) + self.attrs_softmax_b.expr(update=True)) for x in - rnn_outputs[self.config.aux_softmax_layer]] - - morphology_softmax = [[upos, xpos, attrs] for - upos, xpos, attrs in - zip(upos_softmax, xpos_softmax, attrs_softmax)] - - n = len(seq) + 1 - arc_matrix = [[None] * n for _ in range(n)] - if not self.config.predict_morphology: - aux_arc_matrix = [[None] * n for _ in range(n)] - for iDst in range(n): - term_bias = self.link_b.expr(update=True) * arc_projections[iDst][1] - term_weight = self.link_w.expr(update=True) * arc_projections[iDst][1] - if not self.config.predict_morphology: - aux_term_bias = self.aux_link_b.expr(update=True) * aux_arc_projections[iDst][1] - aux_term_weight = self.aux_link_w.expr(update=True) * aux_arc_projections[iDst][1] - for iSrc in range(n): - if iSrc != iDst: - attention = dy.reshape(term_weight, (1, self.config.arc_proj_size)) * arc_projections[iSrc][ - 0] + term_bias - arc_matrix[iSrc][iDst] = attention - if not self.config.predict_morphology: - aux_attention = dy.reshape(aux_term_weight, (1, self.config.arc_proj_size)) * \ - aux_arc_projections[iSrc][0] + aux_term_bias - aux_arc_matrix[iSrc][iDst] = aux_attention - - # compute softmax for arcs - a_m = [[None] * n for _ in range(n)] - if not self.config.predict_morphology: - aux_a_m = [[None] * n for _ in range(n)] - - for iSrc in range(n): - s_max = [] - if not self.config.predict_morphology: - aux_s_max = [] - for iDst in range(n): - if iSrc != iDst: - s_max.append(arc_matrix[iSrc][iDst]) - if not self.config.predict_morphology: - aux_s_max.append(aux_arc_matrix[iSrc][iDst]) - s_max = dy.softmax(dy.concatenate(s_max)) - if not self.config.predict_morphology: - aux_s_max = dy.softmax(dy.concatenate(aux_s_max)) - ofs = 0 - for iDst in range(n): - if iSrc == iDst: - ofs = -1 - else: - a_m[iSrc][iDst] = s_max[iDst + ofs] - if not self.config.predict_morphology: - aux_a_m[iSrc][iDst] = aux_s_max[iDst + ofs] - if not self.config.predict_morphology: - return a_m, aux_a_m, label_projections, None - else: - return a_m, None, label_projections, morphology_softmax[1:-1] - - def save(self, path): - self.model.save(path) - - def load(self, path): - self.model.populate(path) - - def parse_sequences(self, sequences): - new_sequences = [] - for sequence in sequences: - new_sequence = copy.deepcopy(sequence) - predicted_tags = self.tag(new_sequence) - iOrig, iTags = 0, 0 - while iOrig < len(new_sequence): - while new_sequence[iOrig].is_compound_entry: - iOrig += 1 - new_sequence[iOrig].head = predicted_tags[iTags].head - new_sequence[iOrig].label = predicted_tags[iTags].label - if self.config.predict_morphology == True: - new_sequence[iOrig].upos = predicted_tags[iTags].upos - new_sequence[iOrig].xpos = predicted_tags[iTags].xpos - new_sequence[iOrig].attrs = predicted_tags[iTags].attrs - iTags += 1 - iOrig += 1 - - new_sequences.append(new_sequence) - return new_sequences - - -class ParserTag: - def __init__(self, head, label, upos=None, xpos=None, attrs=None, lemma=None): - self.head = head - self.label = label - self.upos = upos - self.xpos = xpos - self.attrs = attrs - self.lemma = lemma diff --git 
a/cube/io_utils/components.py b/cube/io_utils/components.py new file mode 100644 index 000000000..30dcc802e --- /dev/null +++ b/cube/io_utils/components.py @@ -0,0 +1,95 @@ +from cube.data.objects import Doc, Sentence, Word, Token +from typing import List + + +class ComponentType: + Tokenizer = 1 + CWExpander = 2 + POSTagger = 3 + Lemmatizer = 4 + Parser = 5 + NER = 6 + + +class Model: + def __init__(self): + pass + + def __call__(self, input_object, **kwargs): + return None + + +class Component(): + def __init__(self, use_gpu: bool = True, gpu_batch_size: int = 1): + self.input_format = Doc # type of data object or str + self.output_format = Doc # type of data object + self.depends = [] # list of other components + self.provides = [] # list of other components + + self.model_filepath = None + + self.use_gpu = use_gpu + self.gpu_batch_size = gpu_batch_size + + self.model = None + + def load_model(self, model_path): + pass + + def process(self, input_object): + assert (self.model is not None), "Model is None, please load a model first" + return self.model(input_object=input_object) + + +class TokenizerComponent(Component): + def __init__(self): + super().__init__() + self.input_format = str + self.depends = [] + self.provides = [ComponentType.Tokenizer] + + +class CWExpanderComponent(Component): + def __init__(self): + super().__init__() + self.depends = [ComponentType.Tokenizer] + self.provides = [ComponentType.CWExpander] + + +class POSTaggerComponent(Component): + def __init__(self): + super().__init__() + self.depends = [ComponentType.Tokenizer, ComponentType.CWExpander] + self.provides = [ComponentType.POSTagger] + + +class LemmatizerComponent(Component): + def __init__(self): + super().__init__() + self.depends = [ComponentType.Tokenizer, ComponentType.CWExpander, ComponentType.Parser] + self.provides = [ComponentType.Lemmatizer] + + +class ParserComponent(Component): + def __init__(self): + super().__init__() + self.depends = [ComponentType.Tokenizer, ComponentType.CWExpander] + self.provides = [ComponentType.Parser] + + +class NERComponent(Component): + def __init__(self): + super().__init__() + self.depends = [ComponentType.Tokenizer, ComponentType.CWExpander, ComponentType.Parser] + self.provides = [ComponentType.NER] + + +class Pipeline(): + @staticmethod + def is_valid(components: List[Component]): + available = set() + required = set() + for component in components: + available |= set(component.provides) + required |= set(component.depends) + return required.issubset(available) diff --git a/cube/io_utils/config.py b/cube/io_utils/config.py index fd88b6c3d..9a3cd0d97 100644 --- a/cube/io_utils/config.py +++ b/cube/io_utils/config.py @@ -19,7 +19,7 @@ import sys import ast from builtins import object, super -from cube.misc.misc import fopen +from cube.io_utils.misc import fopen import collections if sys.version_info[0] == 2: @@ -84,41 +84,18 @@ def load(self, filename): class TokenizerConfig(Config): def __init__(self, filename=None, verbose=False): super().__init__() - - self.base = "" - # encoder-char - self.char_vocabulary_size = -1 # to be calculated when first training - self.char_embedding_size = 100 - self.char_generic_feature_vocabulary_size = 2 - self.char_generic_feature_embedding_size = 5 - - self.encoder_char_input_attribute_dropout = 0.
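The Component/Pipeline scheme introduced in components.py encodes ordering constraints as `provides`/`depends` sets, so `Pipeline.is_valid` reduces validation to a subset check. A small sketch of what that check accepts and rejects, constructing the components directly and without models, purely for illustration:

from cube.io_utils.components import (Pipeline, TokenizerComponent,
                                      CWExpanderComponent, ParserComponent)

# ParserComponent depends on Tokenizer and CWExpander; without a CWExpander
# in the list, the required set is not a subset of the available set.
print(Pipeline.is_valid([TokenizerComponent(), ParserComponent()]))  # False
print(Pipeline.is_valid([TokenizerComponent(), CWExpanderComponent(),
                         ParserComponent()]))                        # True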
- self.encoder_char_lstm_size = 200 - - # next-chars - self.next_chars_embedding_size = 100 - self.next_chars_window_size = 10 - - # encoder-word - self.encoder_word_input_w2i_array = {} - self.encoder_word_vocab_size = 0 # ref - self.encoder_word_embedding_size = 0 # ref - self.encoder_word_lstm_size = 200 - # decoder - self.decoder_attribute_dropout = 0.33 - self.decoder_hidden_size = 20 - - self.dropout_rate = 0 - # extra - self.patience = -1 - self.tokenize_maximum_sequence_length = 500 # how much to run predict on, at a time + self.cnn_filter = 512 + self.lang_emb_size = 100 + self.cnn_layers = 5 + self.external_proj_size = 300 + self.no_space_lang = False if filename is None: if verbose: sys.stdout.write("No configuration file supplied. Using default values.\n") else: if verbose: - sys.stdout.write("Reading configuration file " + filename + "\n") + sys.stdout.write("Reading configuration file " + filename + " \n") self.load(filename) self._valid = True @@ -127,14 +104,16 @@ def __init__(self, filename=None, verbose=False): class TaggerConfig(Config): def __init__(self, filename=None, verbose=False): super().__init__() - self.layers = [200, 200] - self.layer_dropouts = [0.5, 0.5] - self.aux_softmax_layer = 1 + self.char_emb_size = 256 + self.char_filter_size = 512 + self.char_layers = 3 + self.word_emb_size = 256 + self.lang_emb_size = 64 + self.cnn_filter = 512 + self.cnn_layers = 5 + self.external_proj_size = 300 + self.lm_model = 'xlm-roberta-base' self._valid = True - self.input_dropout_prob = 0.33 - self.presoftmax_mlp_layers = [500] - self.presoftmax_mlp_dropouts = [0.5] - self.input_size = 100 if filename is None: if verbose: @@ -144,36 +123,28 @@ def __init__(self, filename=None, verbose=False): sys.stdout.write("Reading configuration file " + filename + " \n") self.load(filename) - if verbose: - print ("INPUT SIZE:", self.input_size) - print ("LAYERS:", self.layers) - print ("LAYER DROPOUTS:", self.layer_dropouts) - print ("AUX SOFTMAX POSITION:", self.aux_softmax_layer) - print ("INPUT DROPOUT PROB:", self.input_dropout_prob) - print ("PRESOFTMAX MLP LAYERS:", self.presoftmax_mlp_layers) - print ("PRESOFTMAX MLP DROPOUT:", self.presoftmax_mlp_dropouts) - - if self.aux_softmax_layer > len(self.layers) - 1 or self.aux_softmax_layer == 0: - print ( - "Configuration error: aux softmax layer must be placed after the first layer and before the final one.") - self._valid = False - class ParserConfig(Config): def __init__(self, filename=None, verbose=False): super().__init__() - self.layers = [300, 300, 200, 200, 200] - self.layer_dropouts = [0.33, 0.33, 0.33, 0.33, 0.33] - self.aux_softmax_layer = 2 + self.char_emb_size = 256 + self.char_filter_size = 512 + self.char_layers = 5 + self.word_emb_size = 256 + self.lang_emb_size = 64 + self.cnn_filter = 512 + self.cnn_layers = 5 + self.aux_softmax_location = 5 + self.pre_parser_size = 500 + self.head_size = 100 + self.label_size = 200 + self.lm_model = 'xlm-roberta-base' + self.external_proj_size = 300 + self.rhl_win_size = 2 + self.rnn_size = 50 + self.rnn_layers = 3 + self._valid = True - self.input_dropout_prob = 0.33 - self.arc_proj_size = 100 - self.label_proj_size = 400 - self.presoftmax_mlp_dropout = 0.33 - self.predict_morphology = True - self.use_morphology = False - self.use_lexical = True - self.input_embeddings_size = 100 if filename is None: if verbose: @@ -183,37 +154,40 @@ def __init__(self, filename=None, verbose=False): sys.stdout.write("Reading configuration file " + filename + " \n") self.load(filename) - if verbose: 
- print ("LAYERS:", self.layers) - print ("LAYER DROPOUTS:", self.layer_dropouts) - print ("AUX SOFTMAX POSITION:", self.aux_softmax_layer) - print ("INPUT DROPOUT PROB:", self.input_dropout_prob) - print ("ARC PROJECTION SIZE:", self.arc_proj_size) - print ("LABEL PROJECTION SIZE:", self.label_proj_size) - print ("PRESOFTMAX MLP DROPOUT:", self.presoftmax_mlp_dropout) - print ("JOINTLY PARSE AND PREDICT MORPHOLOGY:", self.predict_morphology) - print ("USE MORPHOLOGY AS INPUT:", self.use_morphology) - print ("INPUT EMBEDDINGS SIZE:", self.input_embeddings_size) - if self.aux_softmax_layer > len(self.layers) - 1 or self.aux_softmax_layer == 0: - print ( - "Configuration error: aux softmax layer must be placed after the first layer and before the final one.") - self._valid = False +class LemmatizerConfig(Config): + def __init__(self, filename=None, verbose=False): + super().__init__() + self.encoder_layers = 2 + self.encoder_size = 200 + self.decoder_layers = 2 + self.decoder_size = 400 + self.att_proj_size = 100 + self.upos_emb_size = 100 + self.lang_emb_size = 100 + self.char_emb_size = 100 + self._valid = True - if self.use_morphology and self.predict_morphology: - print ("Configuration error: you are using morphology to predict morphology.") - self._valid = False + if filename is None: + if verbose: + sys.stdout.write("No configuration file supplied. Using default values.\n") + else: + if verbose: + sys.stdout.write("Reading configuration file " + filename + " \n") + self.load(filename) -class LemmatizerConfig(Config): +class CompoundConfig(Config): def __init__(self, filename=None, verbose=False): super().__init__() - self.rnn_size = 200 - self.rnn_layers = 2 - self.char_embeddings = 100 - self.char_rnn_size = 200 - self.char_rnn_layers = 2 - self.tag_embeddings_size = 100 + self.encoder_layers = 2 + self.encoder_size = 200 + self.decoder_layers = 2 + self.decoder_size = 400 + self.att_proj_size = 100 + self.lang_emb_size = 100 + self.char_emb_size = 100 + self._valid = True if filename is None: if verbose: diff --git a/cube/io_utils/encodings.py b/cube/io_utils/encodings.py index c11299947..b7b9133e6 100644 --- a/cube/io_utils/encodings.py +++ b/cube/io_utils/encodings.py @@ -1,30 +1,12 @@ -# -# Author: Tiberiu Boros -# -# Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - +import logging, re import sys -import re -from cube.misc.misc import fopen - +sys.path.append('') +from cube.io_utils.objects import Document -class Encodings(object): - def __init__(self, verbose=True): +class Encodings: + def __init__(self, verbose=False): self.word_list = {} self.hol_word_list = [] self.char2int = {} @@ -39,33 +21,56 @@ def __init__(self, verbose=True): self.attrs_list = [] self.characters = [] self.verbose = verbose + self.num_langs = 0 - def compute(self, train, dev, tag_type=None, word_cutoff=7, char_cutoff=5, CUPT_format=False): + def compute(self, train: Document, dev: Document, word_cutoff=7, char_cutoff=5, CUPT_format=False): if self.verbose: - sys.stdout.write("Computing encoding maps... ") - sys.stdout.flush() - self.char2int[''] = 0 + print("Computing encoding maps... ") + + self.word2int[''] = 0 + self.hol_word_list.append('') + self.word2int[''] = 1 + self.hol_word_list.append('') + self.char2int[''] = 0 + self.char2int[''] = 1 + self.char2int[' '] = 2 + self.upos2int[''] = 0 + self.upos_list.append('') + self.xpos2int[''] = 0 + self.xpos_list.append('') + self.attrs2int[''] = 0 + self.attrs_list.append('') + self.upos2int[''] = 1 + self.upos_list.append('') + self.xpos2int[''] = 1 + self.xpos_list.append('') + self.attrs2int[''] = 1 + self.attrs_list.append('') + self.label2int[''] = 0 + self.labels.append('') + self.label2int[''] = 1 + self.labels.append('') + + self.characters.append("") self.characters.append("") - self.char2int[' '] = 1 self.characters.append(" ") char_count = {} word_count = {} - for seq in train.sequences: - for entry in seq: - if sys.version_info[0] == 2: - word = entry.word.decode('utf-8').lower() - else: - word = entry.word.lower() + + for sentence in train.sentences: # xxx + lang_id = sentence.lang_id + if lang_id + 1 > self.num_langs: + self.num_langs = lang_id + 1 + for entry in sentence.words: # entry is a Word + word = entry.word.lower() if word not in word_count: word_count[word] = 1 else: word_count[word] = word_count[word] + 1 if word not in self.word_list: self.word_list[word] = 0 # word is inside trainset - if sys.version_info[0] == 2: - uniword = unicode(entry.word, 'utf-8').lower() - else: - uniword = entry.word.lower() + + uniword = entry.word.lower() uniword = re.sub('\d', '0', uniword) for i in range(len(uniword)): char = uniword[i].lower() @@ -76,15 +81,8 @@ def compute(self, train, dev, tag_type=None, word_cutoff=7, char_cutoff=5, CUPT_ # if char not in self.char2int: # self.char2int[char] = len(self.char2int) - label = None - if tag_type == 'upos': - label = entry.upos - elif tag_type == 'xpos': - label = entry.xpos - elif tag_type == 'attrs': - label = entry.attrs - elif tag_type == 'label': - label = entry.label + + label = entry.label if CUPT_format and tag_type == 'label': if entry.label != "*": @@ -107,18 +105,14 @@ def compute(self, train, dev, tag_type=None, word_cutoff=7, char_cutoff=5, CUPT_ if entry.attrs not in self.attrs2int: self.attrs2int[entry.attrs] = len(self.attrs2int) self.attrs_list.append(entry.attrs) - - for seq in dev.sequences: - for entry in seq: - if sys.version_info[0] == 2: - word = entry.word.decode('utf-8').lower() - else: + if dev is not None: + for sentence in dev.sentences: + lang_id = sentence._lang_id + for entry in sentence.words: word = entry.word.lower() - if word not in self.word_list: - self.word_list[word] = 1 # word is inside devset only + if word not in self.word_list: + self.word_list[word] = 1 # word is inside devset only - self.word2int[''] = 0 - 
self.hol_word_list.append('') for word in word_count: if word_count[word] >= word_cutoff: self.word2int[word] = len(self.word2int) @@ -135,35 +129,32 @@ def compute(self, train, dev, tag_type=None, word_cutoff=7, char_cutoff=5, CUPT_ self.char2int[ds] = len(self.char2int) self.characters.append(ds) if self.verbose: - sys.stdout.write("done\n") + print("done\n") - print ("Unique words: " + str(len(self.word_list))) - print ("Unique chars: " + str(len(self.char2int))) - print ("Unique labels: " + str(len(self.label2int))) - print ("Unique UPOS: " + str(len(self.upos2int))) - print ("Unique XPOS: " + str(len(self.xpos2int))) - print ("Unique ATTRS: " + str(len(self.attrs2int))) - print ("Holistic word count: " + str(len(self.word2int))) + print("Unique words: " + str(len(self.word_list))) + print("Unique chars: " + str(len(self.char2int))) + print("Unique labels: " + str(len(self.label2int))) + print("Unique UPOS: " + str(len(self.upos2int))) + print("Unique XPOS: " + str(len(self.xpos2int))) + print("Unique ATTRS: " + str(len(self.attrs2int))) + print("Holistic word count: " + str(len(self.word2int))) def update_wordlist(self, dataset): for seq in dataset.sequences: for entry in seq: - import sys - if sys.version_info[0] == 2: - word = entry.word.decode('utf-8').lower() - else: - word = entry.word.lower() + word = entry.word.lower() if word not in self.word_list: self.word_list[word] = 2 # word is inside an auxiliarly set (probably test) def load(self, filename): # We only read character2int, labels, holistic words and label2int here. word_list should be recomputed for every dataset (if deemed necessary) - with fopen(filename, "r") as f: + with open(filename, "r", encoding="utf8") as f: + line = f.readline() + self.num_langs = int(line.strip().split(' ')[-1]) line = f.readline() - num_labels = int(line.split(" ")[1]) if self.verbose: - print ("Loading labels " + str(num_labels)) + print("Loading labels " + str(num_labels)) self.labels = [""] * num_labels for _ in range(num_labels): line = f.readline() @@ -177,30 +168,22 @@ def load(self, filename): num_characters = int(line.split(" ")[1]) self.characters = [""] * num_characters if self.verbose: - print ("Loading characters " + str(num_characters)) + print("Loading characters " + str(num_characters)) for _ in range(num_characters): line = f.readline() parts = line.split("\t") - import sys - if sys.version_info[0] == 2: - key = parts[0].decode('utf-8') - else: - key = parts[0] + key = parts[0] value = int(parts[1]) self.char2int[key] = value self.characters[value] = key line = f.readline() num_words = int(line.split(" ")[1]) if self.verbose: - print ("Loading words " + str(num_words)) + print("Loading words " + str(num_words)) for _x in range(num_words): line = f.readline() parts = line.split("\t") - import sys - if sys.version_info[0] == 2: - key = parts[0].decode('utf-8') - else: - key = parts[0] + key = parts[0] value = int(parts[1]) self.word2int[key] = value @@ -208,7 +191,7 @@ def load(self, filename): line = f.readline() num_labels = int(line.split(" ")[1]) if self.verbose: - print ("Loading upos " + str(num_labels)) + print("Loading upos " + str(num_labels)) self.upos_list = [""] * num_labels for _ in range(num_labels): line = f.readline() @@ -222,7 +205,7 @@ def load(self, filename): num_labels = int(line.split(" ")[1]) self.xpos_list = [""] * num_labels if self.verbose: - print ("Loading xpos " + str(num_labels)) + print("Loading xpos " + str(num_labels)) for _ in range(num_labels): line = f.readline() parts = line.split("\t") @@ 
-235,7 +218,7 @@ def load(self, filename): num_labels = int(line.split(" ")[1]) self.attrs_list = [""] * num_labels if self.verbose: - print ("Loading attrs " + str(num_labels)) + print("Loading attrs " + str(num_labels)) for _ in range(num_labels): line = f.readline() parts = line.split("\t") @@ -246,22 +229,17 @@ def load(self, filename): f.close() def save(self, filename): - f = fopen(filename, "w") + f = open(filename, "w", encoding="utf8") + f.write("LANGS " + str(self.num_langs) + "\n") f.write("LABELS " + str(len(self.label2int)) + "\n") for label in self.label2int: f.write(str(label) + "\t" + str(self.label2int[label]) + "\n") f.write("CHARACTERS " + str(len(self.char2int)) + "\n") for character in self.char2int: - if sys.version_info[0] == 2: - f.write(character.encode('utf-8') + "\t" + str(self.char2int[character]) + "\n") - else: - f.write(character + "\t" + str(self.char2int[character]) + "\n") + f.write(character + "\t" + str(self.char2int[character]) + "\n") f.write("WORDS " + str(len(self.word2int)) + "\n") for word in self.word2int: - if sys.version_info[0] == 2: - f.write(word.encode('utf-8') + "\t" + str(self.word2int[word]) + "\n") - else: - f.write(word + "\t" + str(self.word2int[word]) + "\n") + f.write(word + "\t" + str(self.word2int[word]) + "\n") f.write("UPOS " + str(len(self.upos2int)) + "\n") for label in self.upos2int: diff --git a/cube/io_utils/misc.py b/cube/io_utils/misc.py new file mode 100644 index 000000000..bd99ca80c --- /dev/null +++ b/cube/io_utils/misc.py @@ -0,0 +1,38 @@ +import sys, argparse + + +def fopen(filename, mode="r"): + if sys.version_info[0] == 2: + return open(filename, mode) + else: + if "b" in mode.lower(): + return open(filename, mode) + else: + return open(filename, mode, encoding="utf-8") + + +class ArgParser(): + def __init__(self): + self.parser = argparse.ArgumentParser(description='Tagger ') + self.parser.add_argument('--train', action='store', dest='train_file', + help='Start building a tagger model') + self.parser.add_argument('--patience', action='store', type=int, default=20, dest='patience', + help='Number of epochs before early stopping (default=20)') + self.parser.add_argument('--store', action='store', dest='store', help='Output base', default='data/model') + self.parser.add_argument('--gpus', action='store', dest='gpus', type=int, + help='How many GPUs to use (default=1)', default=1) + self.parser.add_argument('--num-workers', action='store', dest='num_workers', type=int, + help='How many dataloaders to use (default=4)', default=4) + self.parser.add_argument('--batch-size', action='store', type=int, default=16, dest='batch_size', + help='Batch size (default=16)') + self.parser.add_argument('--debug', action='store_true', dest='debug', + help='Do some standard stuff to debug the model') + self.parser.add_argument('--resume', action='store_true', dest='resume', help='Resume training') + self.parser.add_argument('--lm-model', action='store', dest='lm_model', default='transformer:xlm-roberta-base', + help='What LM model to use (default=transformer:xlm-roberta-base)') + self.parser.add_argument('--lm-device', action='store', dest='lm_device', default='cuda:0', + help='Where to load LM (default=cuda:0)') + self.parser.add_argument('--config', action='store', dest='config_file', help='Load config file') + + def __call__(self, *args, **kwargs): + return self.parser.parse_args() diff --git a/cube/io_utils/modelstore.py b/cube/io_utils/modelstore.py new file mode 100644 index 000000000..32fe032db --- /dev/null +++ 
b/cube/io_utils/modelstore.py @@ -0,0 +1,278 @@ +import os, sys, logging, json, requests, uuid, shutil +import zipfile +from pathlib import Path +from typing import Optional, List, Tuple +from tqdm.autonotebook import tqdm as tqdm + +logger = logging.getLogger('cube') + +class ModelStore: + """ + The purpose of this class is to be called from the api.load(language) and give back a list of component paths, + or download them if they are not locally stored + + """ + root_path = os.path.join(str(Path.home()), ".nlpcube") + catalog_url = "https://raw.githubusercontent.com/adobe/NLP-Cube-Models/3.0/models/catalog.json" + + @staticmethod + def solve(lang: str, version: str = "latest", check_for_latest: bool = False) -> Tuple[dict, int]: + """ + TODO docs + :param lang: Language + :param version: force particular version, else latest + :return: Dict of paths, language id + """ + paths = { + "tokenizer": {}, + "cwe": {}, + "lemmatizer": {}, + "tagger": {}, + "parser": {} + } + + # check local catalog and download if not present + catalog = ModelStore._get_catalog(check_for_latest = check_for_latest) + + # identify all entries that match this lang and select the appropriate one + if lang not in catalog: + raise Exception(f"Language '{lang}' is not available!") + + entries = catalog[lang] + if len(entries) == 0: + raise Exception(f"Language '{lang}' is not available!") + + found = False + if version != "latest": # check for specific version + for entry in entries: + if entry["version"] == version: + found = True + model_url = entry["link"] + langid = entry["langid"] + parts = entry["parts"] + else: + max_version = 0.0 + for entry in entries: # check for latest version + try: # get version + entry_version = float(entry["version"]) + if entry_version > max_version: + found = True + max_version = entry_version + model_url = entry["link"] + langid = entry["langid"] + parts = entry["parts"] + except: + pass + version = max_version + + if not found: + raise Exception(f"Language '{lang}', version '{version}' not found, but there are other versions available!") + + # check if it's present, else download model in folder "name.version", return abspath of model folder + model_folder = ModelStore._get_model(model_url, parts) + + # compose and return paths + files = [x for x in os.listdir(model_folder)] # file names only + + #files = [os.path.abspath(os.path.join(model_folder, x)) for x in files] # get full path + + tokenizer_files = [x for x in files if "tokenizer" in x] + cwe_files = [x for x in files if "cwe" in x] + lemmatizer_files = [x for x in files if "lemmatizer" in x] + tagger_files = [x for x in files if "tagger" in x] + parser_files = [x for x in files if "parser" in x] + + if len(tokenizer_files) > 0: + tokenizer_entry = {"config": ModelStore.__get_file_path(tokenizer_files, ".config", model_folder), + "encodings": ModelStore.__get_file_path(tokenizer_files, ".encodings", model_folder), + #"sent": ModelStore.__get_file_path(tokenizer_files, ".sent", model_folder), + "model": ModelStore.__get_file_path(tokenizer_files, ".tok", model_folder)} + + if tokenizer_entry["config"] and tokenizer_entry["encodings"] and tokenizer_entry["model"]: + paths["tokenizer"] = tokenizer_entry + + if len(cwe_files) > 0: + cwe_entry = {"config": ModelStore.__get_file_path(cwe_files, ".config", model_folder), + "encodings": ModelStore.__get_file_path(cwe_files, ".encodings", model_folder), + "model": ModelStore.__get_file_path(cwe_files, ".best", model_folder)} + + if cwe_entry["config"] and cwe_entry["encodings"] and 
cwe_entry["model"]: + paths["cwe"] = cwe_entry + + if len(lemmatizer_files) > 0: + lemmatizer_entry = {"config": ModelStore.__get_file_path(lemmatizer_files, ".config", model_folder), + "encodings": ModelStore.__get_file_path(lemmatizer_files, ".encodings", model_folder), + "model": ModelStore.__get_file_path(lemmatizer_files, ".best", model_folder)} + + if lemmatizer_entry["config"] and lemmatizer_entry["encodings"] and lemmatizer_entry["model"]: + paths["lemmatizer"] = lemmatizer_entry + + """ + if len(tagger_files) > 0: + tagger_entry = {"config": ModelStore.__get_file_path(tagger_files, ".config", model_folder), + "encodings": ModelStore.__get_file_path(tagger_files, ".encodings", model_folder), + "sent": ModelStore.__get_file_path(tagger_files, ".sent", model_folder), + "tok": ModelStore.__get_file_path(tagger_files, ".tok", model_folder)} + + if tagger_entry["config"] and tagger_entry["encodings"] and tagger_entry["sent"] and \ + tagger_entry["tok"]: + paths["tagger"] = tagger_entry + """ + + if len(parser_files) > 0: + parser_entry = {"config": ModelStore.__get_file_path(parser_files, ".config", model_folder), + "encodings": ModelStore.__get_file_path(parser_files, ".encodings", model_folder), + "model": ModelStore.__get_file_path(parser_files, ".las", model_folder)} + + if parser_entry["config"] and parser_entry["encodings"] and parser_entry["model"]: + paths["parser"] = parser_entry + + return paths, langid + + @staticmethod + def __get_file_path(files, extension, model_folder): + """ + This function returns the abspath of the only one file from the 'files' list that has the given extension. + 'files' does not contain paths, only filenames + """ + valid_files = [] + for file in files: + if file.endswith(extension): + valid_files.append(file) + if len(valid_files)!=1: + return None + return os.path.abspath(os.path.join(model_folder, valid_files[0])) + + + @staticmethod + def _get_catalog(check_for_latest: bool = False): + local_path = os.path.join(ModelStore.root_path, "catalog.json") + + if not os.path.exists(local_path) or check_for_latest is True: + print("Catalog either does not exist or looking for updates, downloading ... ") + status_code = ModelStore.__download_file(ModelStore.catalog_url, local_path) + if status_code != 200: + raise Exception( + "Catalog download failed with status_code {}".format(status_code)) + + if not os.path.exists(local_path): + raise Exception("Sanity check failed, catalog file not found locally, though it was downloaded!") + + with open(local_path, "r", encoding="utf8") as f: + catalog = json.load(f) + + return catalog + + @staticmethod + def _get_model(model_url: str, parts: int): + if model_url.endswith("/"): # remove last /, + model_url = model_url[:-1] + + model_name = model_url.split("/")[-1] # e.g. en_partut-1.0 + model_folder = os.path.join(ModelStore.root_path, "models", model_name) + + # check if model already exists + if os.path.exists(model_folder): + if len([f for f in os.listdir(model_folder) if os.path.isfile(os.path.join(model_folder, f))])>0: + return os.path.abspath(model_folder) # model is present + # todo Sanity check that model is valid here, not in solve. 
+ + temp_folder = os.path.join(ModelStore.root_path, str(uuid.uuid4().hex)) + os.mkdir(temp_folder) + + # download each file + print("Downloading model {} ...".format(model_name)) + current_part = 0 + for current_part in range(parts): + current_file = os.path.join(temp_folder, "{}.{}".format(model_name, current_part)) + current_url = "{}.{}".format(model_url, current_part) + status_code = ModelStore.__download_file(current_url, current_file, description=f" ... download model part {current_part+1}/{parts}") + if status_code != 200: + raise Exception(f"Error downloading file {current_url}, received status code {status_code}") + + print("Merging model parts ...") + zip_file = os.path.join(temp_folder, "archive.zip") + with open(zip_file, "wb") as f: + for i in range(parts): + with open(os.path.join(temp_folder, "{}.{}".format(model_name, i)), "rb") as r: + f.write(r.read()) + + print("Unzipping ...") + os.makedirs(model_folder, exist_ok=True) + zip = zipfile.ZipFile(zip_file, "r") + zip.extractall(model_folder) + zip.close() + + print("Cleaning up ...") + if os.path.exists(temp_folder) and os.path.isdir(temp_folder): + shutil.rmtree(temp_folder) + + # todo sanity check + print("Model downloaded successfully!") + + return os.path.abspath(model_folder) + + @staticmethod + def __download_file(url: str, filename: str, description=None): + r = requests.get(url, stream=True) + if r.status_code != 200: + raise Exception(f"Error getting {url}, received status_code {r.status_code}") + file_size = int(r.headers['Content-Length']) + chunk_size = 1024 + + with open(filename, 'wb') as fp: + with tqdm(total=file_size, unit='B', unit_scale=True, desc=description, unit_divisor=1024, disable= True if description is None else False, leave = False) as progressbar: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk is not None: + fp.write(chunk) + fp.flush() + progressbar.update(len(chunk)) + + return r.status_code + + @staticmethod + def _pack_model(input_folder: str, output_folder: str, model_name:str, split_size_in_mb: int = 50) -> int: + """ + Zips everything in input folder, splits it in output_folder with model_name., return number of parts + """ + + # zip all files in input_folder as input_folder/archive.zip + print(f"Zipping files from {input_folder}:") + zip_file_path = os.path.join(input_folder, "archive.zip") + zip = zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) + root_len = len(os.path.abspath(input_folder)) + for root, dirs, files in os.walk(input_folder): + archive_root = os.path.abspath(root)[root_len:] + for f in files: + if ".zip" in f: + continue + fullpath = os.path.join(root, f) + archive_name = os.path.join(archive_root, f) + print(f"\t adding {fullpath} ...") + zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED) + + zip.close() + + # split archive.zip in shards in output_folder + counter = 0 + with open(zip_file_path, "rb") as f: + while True: + byte_s = f.read(split_size_in_mb * 1024 * 1024) + if not byte_s: + break + chunk_file_path = os.path.join(output_folder, "{}.{}".format(model_name, counter)) + logging.info("\t writing {:.2f}MB to {} ...".format(len(byte_s) / 1024 / 1024, chunk_file_path)) + with open(chunk_file_path, "wb") as r: + r.write(byte_s) + counter += 1 + + # return number of files + return counter + + + + +# ensure we get a valid root path for local model storage +ModelStore.root_path = os.path.join(str(Path.home()), ".nlpcube") +if not os.path.exists(ModelStore.root_path): + os.makedirs(ModelStore.root_path, exist_ok=True) diff 
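Because the shards written by _pack_model are raw byte slices of a single zip archive, the download path in _get_model only needs to concatenate them before extracting. A self-contained sketch of the round trip, with hypothetical folder names:

    import os
    import zipfile

    from cube.io_utils.modelstore import ModelStore

    os.makedirs('scratch/shards', exist_ok=True)
    n_parts = ModelStore._pack_model('scratch/model_in', 'scratch/shards', 'en_demo-1.0')

    # merging mirrors _get_model: concatenate the shards, then unzip
    with open('scratch/archive.zip', 'wb') as out:
        for i in range(n_parts):
            with open(os.path.join('scratch/shards', 'en_demo-1.0.{0}'.format(i)), 'rb') as shard:
                out.write(shard.read())
    with zipfile.ZipFile('scratch/archive.zip', 'r') as z:
        z.extractall('scratch/model_out')
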
--git a/cube/io_utils/objects.py b/cube/io_utils/objects.py new file mode 100644 index 000000000..1d9819f78 --- /dev/null +++ b/cube/io_utils/objects.py @@ -0,0 +1,183 @@
+import sys
+
+sys.path.append('')
+from cube.io_utils.misc import fopen
+
+
+class Document:
+    """
+    A Document is a collection of sentences. The structures inside are compatible with the CoNLL-U format. See https://universaldependencies.org/format.html for more details.
+
+    Example usage:
+    doc = Document(filename='corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-dev.conllu')
+    """
+
+    def __init__(self, filename: str = None, lang_id: int = 0):
+        """
+        Create a new Document instance.
+
+        Pass the optional arguments filename and lang_id to load a UD-style document.
+        """
+        self.sentences = []
+        if filename is not None:
+            self.load(filename, lang_id)
+
+    def load(self, filename: str, lang_id: int = 0):
+        """
+        Load a CoNLL-U file into the document.
+
+        Params:
+            filename - mandatory parameter
+            lang_id - optional parameter (default value=0)
+
+        Example usage:
+        doc = Document()
+        doc.load('corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-dev.conllu', lang_id=1000)
+        """
+        in_sequence = False
+        f = fopen(filename, 'r')
+        seq = []
+        cnt = 0
+        for line in f.readlines():
+            line = line.replace("\n", "")
+            line = line.replace("\r", "")
+            cnt += 1
+            # if cnt == 100:
+            #     break
+            if (not line.startswith("#") or in_sequence) and line != '':
+                parts = line.split("\t")
+                s = Word(parts[0], parts[1], parts[2], parts[3], parts[4], parts[5], parts[6], parts[7],
+                         parts[8], parts[9])
+                seq.append(s)
+                in_sequence = True
+            elif line == "":
+                in_sequence = False
+                if len(seq) > 0:
+                    self.sentences.append(Sentence(sequence=seq, lang_id=lang_id))
+                seq = []
+        f.close()
+
+    def text(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        return '\n\n'.join([str(s) for s in self.sentences])
+
+
+class Sentence:
+    """
+    A sentence is a collection of tokens and words. This class is not meant to be initialized outside of the Document structure.
+    """
+
+    def __init__(self, sequence=None, lang_id=0, text=None):
+        self.doc = None
+        self.tokens = []
+        self.words = []
+
+        self.lang_id = lang_id
+        skip = 0
+        t = None
+        if sequence is not None:
+            for w in sequence:
+                if '.' in str(w.index):
+                    continue
+                if skip == 0:
+                    t = Token(index=w.index, text=w.word, space_after=('spaceafter=no' not in w.space_after.lower()),
+                              words=[])
+                    self.tokens.append(t)
+                    if w.is_compound_entry:
+                        parts = w.index.split('-')
+                        skip = int(parts[1]) - int(parts[0]) + 1
+                    else:
+                        skip = 1
+
+                if not w.is_compound_entry:
+                    skip -= 1
+                    w.token = t
+                    t.words.append(w)
+                    self.words.append(w)
+
+        if text is None:
+            self.text = self._detokenize()
+        else:
+            self.text = text
+
+    def _detokenize(self):
+        s = []
+        for t in self.tokens:
+            s.append(t.text)
+            if t.space_after:
+                s.append(' ')
+        return ''.join(s)
+
+    def __repr__(self):
+        return '\n'.join([str(t) for t in self.tokens])
+
+
+class Token:
+    """
+    A token contains a list of composing words. Except for multiword tokens, the list of words will contain a single element.
+    This class is not meant to be initialized outside the Document/Sentence structures.
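+    For a multiword token such as the Spanish "del", `words` holds the underlying syntactic words ("de" and "el"), while `text` keeps the surface form.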
+ """ + + def __init__(self, index=0, text: str = '', words=[], space_after=True): + self.index = index + self.text = text + self.words = words + self.other = None # ner, sentiment, etc + self.space_after = space_after + + def __repr__(self): + if not self.space_after: + spa = 'SpaceAfter=No' + else: + spa = '_' + head = '' + if len(self.words) > 1: + head = "\t".join([str(self.index), self.text, + '_', '_', + '_', '_', '_', '_', '_', spa]) + '\n' + return head + '\n'.join([str(w) for w in self.words]) + + +def _int_try_parse(value): + try: + return int(value), False + except ValueError: + return value, True + + +class Word: + """ + Structure to hold CONLL-U style metadata for words. See https://universaldependencies.org/format.html for more details about the format + """ + + def __init__(self, index, word: str, lemma: str, upos: str, xpos: str, attrs: str, head, label: str, deps: str, + space_after: str, token: Token = None): + self.index, self.is_compound_entry = _int_try_parse(index) + self.word = word + self.lemma = lemma + self.upos = upos + self.xpos = xpos + self.attrs = attrs + self.head, _ = _int_try_parse(head) + self.label = label + self.deps = deps + self.space_after = space_after + self.parent = token + self.emb = None + + def __repr__(self): + return "\t".join([str(self.index), self.word if isinstance(self.word, str) else self.word.encode('utf-8'), + self.lemma if isinstance(self.lemma, str) else self.lemma.encode('utf-8'), self.upos, + self.xpos, self.attrs, str(self.head), self.label, self.deps, self.space_after]) + + +if __name__ == '__main__': + print("test") + doc = Document(filename='corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-dev.conllu') + print(doc) diff --git a/cube/networks/__init__.py b/cube/networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cube/networks/compound.py b/cube/networks/compound.py new file mode 100644 index 000000000..ad16ce976 --- /dev/null +++ b/cube/networks/compound.py @@ -0,0 +1,338 @@ +import sys +from typing import * + +import torch +import torch.nn as nn +import torch.nn.functional as F +import pytorch_lightning as pl +import numpy as np +from torch.utils.data import DataLoader + +from cube.io_utils.objects import Document +from cube.networks.utils import Word2TargetCollate, CompoundDataset + +sys.path.append('') +from cube.io_utils.encodings import Encodings +from cube.io_utils.config import CompoundConfig +from cube.networks.modules import LinearNorm, ConvNorm, Attention + + +class Compound(pl.LightningModule): + encodings: Encodings + config: CompoundConfig + + def __init__(self, config: CompoundConfig, encodings: Encodings, language_codes: [] = None): + super(Compound, self).__init__() + NUM_FILTERS = 512 + NUM_LAYERS = 5 + self._config = config + self._encodings = encodings + self._num_languages = encodings.num_langs + self._language_codes = language_codes + self._eol = len(encodings.char2int) + self._num_filters = NUM_FILTERS + + self._char_list = ['' for char in encodings.char2int] + for char in encodings.char2int: + self._char_list[encodings.char2int[char]] = char + self._lang_emb = nn.Embedding(self._num_languages + 1, config.lang_emb_size, padding_idx=0) + self._char_emb = nn.Embedding(len(encodings.char2int) + 2, config.char_emb_size, + padding_idx=0) # start/stop index + self._case_emb = nn.Embedding(4, 16, padding_idx=0) # 0-pad 1-symbol 2-upper 3-lower + convolutions = [] + cs_inp = config.char_emb_size + config.lang_emb_size + 16 + + for _ in range(NUM_LAYERS): + conv_layer = nn.Sequential( + 
ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + convolutions.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + config.lang_emb_size + + self._convolutions_char = nn.ModuleList(convolutions) + self._decoder = nn.LSTM( + NUM_FILTERS // 2 + config.char_emb_size + config.lang_emb_size + 16, + config.decoder_size, config.decoder_layers, + batch_first=True, bidirectional=False) + self._attention = Attention( + (NUM_FILTERS // 2 + config.lang_emb_size + 16) // 2, + config.decoder_size, config.att_proj_size) + + self._output_char = LinearNorm(config.decoder_size, len(self._encodings.char2int) + 2) + self._output_case = LinearNorm(config.decoder_size, 4) + self._start_frame = nn.Embedding(1, + NUM_FILTERS // 2 + config.char_emb_size + config.lang_emb_size + 16) + + if self._language_codes: + self._res = {} + for language_code in self._language_codes: + self._res[language_code] = {"loss": 0., "acc": 0.} + self._early_stop_meta_val = 0 + self._epoch_results = None + + def forward(self, X): + x_char = X['x_char'] + x_case = X['x_case'] + x_lang = X['x_lang'] + x_upos = X['x_upos'] + + if 'y_char' in X: + gs_output = X['y_char'] + else: + gs_output = None + + char_emb = self._char_emb(x_char) + case_emb = self._case_emb(x_case) + + lang_emb = self._lang_emb(x_lang).unsqueeze(1).repeat(1, char_emb.shape[1], 1) + conditioning = case_emb + if gs_output is not None: + output_idx = gs_output + + x = torch.cat((char_emb, conditioning), dim=-1) + half = self._num_filters // 2 + count = 0 + res = None + skip = None + x_lang_conv = lang_emb.permute(0, 2, 1) + x = x.permute(0, 2, 1) + for conv in self._convolutions_char: + count += 1 + drop = self.training + if count >= len(self._convolutions_char): + drop = False + if skip is not None: + x = x + skip + + x = torch.cat([x, x_lang_conv], dim=1) + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = tmp + else: + res = res + tmp + skip = tmp + x = torch.dropout(tmp, 0.1, drop) + x = x + res + x = x.permute(0, 2, 1) + encoder_output = torch.cat((x, conditioning, lang_emb), dim=-1) + + step = 0 + done = np.zeros(encoder_output.shape[0]) + start_frame = self._start_frame( + torch.tensor([0], dtype=torch.long, device=self._get_device())).unsqueeze(1).repeat(encoder_output.shape[0], + 1, 1) + decoder_output, decoder_hidden = self._decoder(start_frame) + + out_char_list = [] + out_case_list = [] + while True: + if gs_output is not None: + if step == output_idx.shape[1]: + break + elif np.sum(done) == encoder_output.shape[0]: + break + elif step == encoder_output.shape[1] * 20: # failsafe + break + + att = self._attention(decoder_hidden[-1][-1, :, :], encoder_output) + context = torch.bmm(att.unsqueeze(1), encoder_output) + + if step == 0: + prev_char_emb = torch.zeros((encoder_output.shape[0], 1, self._config.char_emb_size), + device=self._get_device()) + + decoder_input = torch.cat((context, prev_char_emb), dim=-1) + decoder_output, decoder_hidden = self._decoder(decoder_input, + hx=(torch.dropout(decoder_hidden[0], 0.5, self.training), + torch.dropout(decoder_hidden[1], 0.5, self.training))) + + output_char = self._output_char(decoder_output) + output_case = self._output_case(decoder_output) + out_char_list.append(output_char) + out_case_list.append(output_case) + selected_chars = torch.argmax(output_char, dim=-1) + for ii in range(selected_chars.shape[0]): + if selected_chars[ii].squeeze() == self._eol: 
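+                    # greedy decoding: a sequence is marked done once it emits the end-of-lemma
+                    # symbol; the loop exits when every batch item is done (or on the failsafe above)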
+ done[ii] = 1 + if gs_output is not None: + prev_char_emb = self._char_emb(output_idx[:, step]).unsqueeze(1) + else: + prev_char_emb = self._char_emb(selected_chars) + + step += 1 + + return torch.cat(out_char_list, dim=1), torch.cat(out_case_list, dim=1) + + def save(self, path): + torch.save(self.state_dict(), path) + + def load(self, model_path: str, device: str = 'cpu'): + self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict']) + self.to(device) + + def _get_device(self): + if self._char_emb.weight.device.type == 'cpu': + return 'cpu' + return '{0}:{1}'.format(self._char_emb.weight.device.type, str(self._char_emb.weight.device.index)) + + def process(self, doc: Document, collate: Word2TargetCollate, batch_size: int = 4, + num_workers: int = 4) -> Document: + self.eval() + dataset = CompoundDataset(doc, for_training=False) + dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate.collate_fn, + shuffle=False, num_workers=num_workers) + + data_iterator = iter(dataloader) + + end_char_value = len(self._encodings.char2int) + + with torch.no_grad(): + all_lemmas = [] + for batch in dataloader: + del batch['y_char'] # set for prediction, not training + del batch['y_case'] + + for key in batch: + if isinstance(batch[key], torch.Tensor): + batch[key] = batch[key].to(self._device) + + y_char_pred, y_case_pred = self.forward(batch) + y_char_pred = torch.argmax(y_char_pred.detach(), dim=-1).cpu().numpy() # list of lists of int + y_case_pred = torch.argmax(y_case_pred.detach(), dim=-1).cpu().numpy() # list of lists of int + for word_index in range(y_char_pred.shape[0]): + # get letters + lemma = [] + for char_val, case_val in zip(y_char_pred[word_index], + y_case_pred[word_index]): # [[24, 12, 88]], get the inside list + if char_val == end_char_value: + break + chr = self._encodings.characters[char_val] + if case_val == 2: + chr = chr.upper() + elif case_val == 3: + chr = chr.lower() + lemma.append(chr) + + all_lemmas.append("".join(lemma)) + compound_index = 0 + doc_new = Document() + for sentence_index in range(len(doc.sentences)): + seq = [] + cnt = 1 + from cube.io_utils.objects import Word, Sentence + + for word_index in range(len(doc.sentences[sentence_index].words)): + spaceafter = doc.sentences[sentence_index].words[word_index].space_after.replace(';compund', '') + w = doc.sentences[sentence_index].words[word_index].word + + seq.append(Word(cnt, w, '_', '_', '_', '_', 0, '_', '_', spaceafter)) + if ';compund' in doc.sentences[sentence_index].words[word_index].space_after: + parts = all_lemmas[compound_index].split(' ') + seq[-1].index = '{0}-{1}'.format(cnt, cnt + len(parts) - 1) + seq[-1].is_compound_entry = True + for p in parts: + seq.append(Word(cnt, p, '_', '_', '_', '_', 0, '_', '_', '_')) + cnt += 1 + else: + cnt += 1 + compound_index += 1 + doc_new.sentences.append(Sentence(seq, lang_id=doc.sentences[sentence_index].lang_id)) + return doc_new + + def configure_optimizers(self): + return torch.optim.AdamW(self.parameters()) + + def training_step(self, batch, batch_idx): + y_char_pred, y_case_pred = self.forward(batch) + y_char_target, y_case_target = batch['y_char'], batch['y_case'] + loss_char = F.cross_entropy(y_char_pred.view(-1, y_char_pred.shape[2]), y_char_target.view(-1), ignore_index=0) + loss_case = F.cross_entropy(y_case_pred.view(-1, y_case_pred.shape[2]), y_case_target.view(-1), ignore_index=0) + return loss_char + loss_case + + def validation_step(self, batch, batch_idx): + y_char_target, y_case_target = batch['y_char'], 
batch['y_case'] + del batch['y_char'] + y_char_pred, y_case_pred = self.forward(batch) + language_result = {lang_id: {'total': 0, 'ok': 0} + for lang_id in range(self._num_languages)} + + y_char_target = y_char_target.detach().cpu().numpy() + y_char_pred = torch.argmax(y_char_pred.detach(), dim=-1).cpu().numpy() + lang = batch['x_lang'].detach().cpu().numpy() + for lang_id, y_pred, y_target in zip(lang, y_char_pred, y_char_target): + valid = True + for y_p, y_t in zip(y_pred, y_target): + if y_t != 0 and y_p != y_t: + valid = False + break + if valid: + language_result[lang_id - 1]['ok'] += 1 + language_result[lang_id - 1]['total'] += 1 + + return {'acc': language_result} + + def validation_epoch_end(self, outputs: List[Any]) -> None: + language_result = {lang_id: {'total': 0, 'ok': 0} + for lang_id in + range(self._num_languages)} + for result in outputs: + for lang_id in result['acc']: + language_result[lang_id]['ok'] += result['acc'][lang_id]['ok'] + language_result[lang_id]['total'] += result['acc'][lang_id]['total'] + + res = {} + for lang_index in language_result: + total = language_result[lang_index]['total'] + if total == 0: + total = 1 + if self._language_codes is None: + lang = lang_index + else: + lang = self._language_codes[lang_index] + res[lang] = { + "acc": language_result[lang_index]['ok'] / total, + } + + self.log('val/ACC/{0}'.format(lang), language_result[lang_index]['ok'] / total) + + # single value for early stopping + self._epoch_results = self._compute_early_stop(res) + self.log('val/early_meta', self._early_stop_meta_val) + + def _compute_early_stop(self, res): + for lang in res: + if res[lang]["acc"] > self._res[lang]["acc"]: + self._early_stop_meta_val += 1 + self._res[lang]["acc"] = res[lang]["acc"] + res[lang]["acc_best"] = True + return res + + class PrintAndSaveCallback(pl.callbacks.Callback): + def __init__(self, store_prefix): + super().__init__() + self.store_prefix = store_prefix + + def on_validation_end(self, trainer, pl_module): + metrics = trainer.callback_metrics + epoch = trainer.current_epoch + + for lang in pl_module._epoch_results: + res = pl_module._epoch_results[lang] + if "acc_best" in res: + trainer.save_checkpoint(self.store_prefix + "." 
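+                    # one checkpoint per language that improved this epoch:
+                    # <store_prefix>.<lang>.best, plus the rolling <store_prefix>.last below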
+ lang + ".best") + + trainer.save_checkpoint(self.store_prefix + ".last") + + s = "{0:30s}\tACC".format("Language") + print("\n\n\t" + s) + print("\t" + ("=" * (len(s) + 16))) + for lang in pl_module._language_codes: + acc = metrics["val/ACC/{0}".format(lang)] + msg = "\t{0:30s}:\t{1:.4f}".format(lang, acc) + print(msg) + print("\n") diff --git a/cube/networks/lemmatizer.py b/cube/networks/lemmatizer.py new file mode 100644 index 000000000..149889062 --- /dev/null +++ b/cube/networks/lemmatizer.py @@ -0,0 +1,324 @@ +import sys +from typing import * + +import torch +import torch.nn as nn +import torch.nn.functional as F +import pytorch_lightning as pl +import numpy as np +from torch.utils.data import DataLoader + +from cube.io_utils.objects import Document +from cube.networks.utils import LemmaDataset, Word2TargetCollate + +sys.path.append('') +from cube.io_utils.encodings import Encodings +from cube.io_utils.config import LemmatizerConfig +from cube.networks.modules import LinearNorm, ConvNorm, Attention + + +class Lemmatizer(pl.LightningModule): + encodings: Encodings + config: LemmatizerConfig + + def __init__(self, config: LemmatizerConfig, encodings: Encodings, language_codes: [] = None): + super(Lemmatizer, self).__init__() + NUM_FILTERS = 512 + NUM_LAYERS = 5 + self._config = config + self._encodings = encodings + self._num_languages = encodings.num_langs + self._language_codes = language_codes + self._eol = len(encodings.char2int) + self._num_filters = NUM_FILTERS + + self._char_list = ['' for char in encodings.char2int] + for char in encodings.char2int: + self._char_list[encodings.char2int[char]] = char + self._lang_emb = nn.Embedding(self._num_languages + 1, config.lang_emb_size, padding_idx=0) + self._upos_emb = nn.Embedding(len(encodings.upos2int), config.upos_emb_size, padding_idx=0) + self._char_emb = nn.Embedding(len(encodings.char2int) + 2, config.char_emb_size, + padding_idx=0) # start/stop index + self._case_emb = nn.Embedding(4, 16, padding_idx=0) # 0-pad 1-symbol 2-upper 3-lower + convolutions = [] + cs_inp = config.char_emb_size + config.lang_emb_size + config.upos_emb_size + 16 + + for _ in range(NUM_LAYERS): + conv_layer = nn.Sequential( + ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + convolutions.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + config.lang_emb_size + + self._convolutions_char = nn.ModuleList(convolutions) + self._decoder = nn.LSTM( + NUM_FILTERS // 2 + config.char_emb_size + config.lang_emb_size + config.upos_emb_size + 16, + config.decoder_size, config.decoder_layers, + batch_first=True, bidirectional=False) + self._attention = Attention( + (NUM_FILTERS // 2 + config.lang_emb_size + config.upos_emb_size + 16) // 2, + config.decoder_size, config.att_proj_size) + + self._output_char = LinearNorm(config.decoder_size, len(self._encodings.char2int) + 2) + self._output_case = LinearNorm(config.decoder_size, 4) + self._start_frame = nn.Embedding(1, + NUM_FILTERS // 2 + config.char_emb_size + config.lang_emb_size + config.upos_emb_size + 16) + + if self._language_codes: + self._res = {} + for language_code in self._language_codes: + self._res[language_code] = {"loss": 0., "acc": 0.} + self._early_stop_meta_val = 0 + self._epoch_results = None + + def forward(self, X): + x_char = X['x_char'] + x_case = X['x_case'] + x_lang = X['x_lang'] + x_upos = X['x_upos'] + + if 'y_char' in X: + gs_output = X['y_char'] + else: + gs_output = None + + char_emb = 
self._char_emb(x_char) + case_emb = self._case_emb(x_case) + + upos_emb = self._upos_emb(x_upos).unsqueeze(1).repeat(1, char_emb.shape[1], 1) + lang_emb = self._lang_emb(x_lang).unsqueeze(1).repeat(1, char_emb.shape[1], 1) + conditioning = torch.cat((case_emb, upos_emb), dim=-1) + if gs_output is not None: + output_idx = gs_output + + x = torch.cat((char_emb, conditioning), dim=-1) + half = self._num_filters // 2 + count = 0 + res = None + skip = None + x_lang_conv = lang_emb.permute(0, 2, 1) + x = x.permute(0, 2, 1) + for conv in self._convolutions_char: + count += 1 + drop = self.training + if count >= len(self._convolutions_char): + drop = False + if skip is not None: + x = x + skip + + x = torch.cat([x, x_lang_conv], dim=1) + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = tmp + else: + res = res + tmp + skip = tmp + x = torch.dropout(tmp, 0.1, drop) + x = x + res + x = x.permute(0, 2, 1) + encoder_output = torch.cat((x, conditioning, lang_emb), dim=-1) + + step = 0 + done = np.zeros(encoder_output.shape[0]) + start_frame = self._start_frame( + torch.tensor([0], dtype=torch.long, device=self._get_device())).unsqueeze(1).repeat(encoder_output.shape[0], + 1, 1) + decoder_output, decoder_hidden = self._decoder(start_frame) + + out_char_list = [] + out_case_list = [] + while True: + if gs_output is not None: + if step == output_idx.shape[1]: + break + elif np.sum(done) == encoder_output.shape[0]: + break + elif step == encoder_output.shape[1] * 20: # failsafe + break + + att = self._attention(decoder_hidden[-1][-1, :, :], encoder_output) + context = torch.bmm(att.unsqueeze(1), encoder_output) + + if step == 0: + prev_char_emb = torch.zeros((encoder_output.shape[0], 1, self._config.char_emb_size), + device=self._get_device()) + + decoder_input = torch.cat((context, prev_char_emb), dim=-1) + decoder_output, decoder_hidden = self._decoder(decoder_input, + hx=(torch.dropout(decoder_hidden[0], 0.5, self.training), + torch.dropout(decoder_hidden[1], 0.5, self.training))) + + output_char = self._output_char(decoder_output) + output_case = self._output_case(decoder_output) + out_char_list.append(output_char) + out_case_list.append(output_case) + selected_chars = torch.argmax(output_char, dim=-1) + for ii in range(selected_chars.shape[0]): + if selected_chars[ii].squeeze() == self._eol: + done[ii] = 1 + if gs_output is not None: + prev_char_emb = self._char_emb(output_idx[:, step]).unsqueeze(1) + else: + prev_char_emb = self._char_emb(selected_chars) + + step += 1 + + return torch.cat(out_char_list, dim=1), torch.cat(out_case_list, dim=1) + + def save(self, path): + torch.save(self.state_dict(), path) + + def load(self, model_path: str, device: str = 'cpu'): + self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict']) + self.to(device) + + def _get_device(self): + if self._char_emb.weight.device.type == 'cpu': + return 'cpu' + return '{0}:{1}'.format(self._char_emb.weight.device.type, str(self._char_emb.weight.device.index)) + + def process(self, doc: Document, collate: Word2TargetCollate, batch_size: int = 4, + num_workers: int = 4) -> Document: + self.eval() + dataset = LemmaDataset(doc, for_training=False) + + dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate.collate_fn, + shuffle=False, num_workers=num_workers) + + data_iterator = iter(dataloader) + + end_char_value = len(self._encodings.char2int) + + with torch.no_grad(): + all_lemmas = [] + for batch in dataloader: + 
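+                # dropping the gold targets switches forward() to free-running decoding
+                # (gs_output is None), i.e. no teacher forcing at inference time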
del batch['y_char'] # set for prediction, not training + del batch['y_case'] + + for key in batch: + if isinstance(batch[key], torch.Tensor): + batch[key] = batch[key].to(self._device) + + y_char_pred, y_case_pred = self.forward(batch) + y_char_pred = torch.argmax(y_char_pred.detach(), dim=-1).cpu().numpy() # list of lists of int + y_case_pred = torch.argmax(y_case_pred.detach(), dim=-1).cpu().numpy() # list of lists of int + for word_index in range(y_char_pred.shape[0]): + # get letters + lemma = [] + for char_val, case_val in zip(y_char_pred[word_index], + y_case_pred[word_index]): # [[24, 12, 88]], get the inside list + if char_val == end_char_value: + break + chr = self._encodings.characters[char_val] + if case_val == 2: + chr = chr.upper() + elif case_val == 3: + chr = chr.lower() + lemma.append(chr) + + all_lemmas.append("".join(lemma)) + lemma_index = 0 + for sentence_index in range(len(doc.sentences)): + for word_index in range(len(doc.sentences[sentence_index].words)): + doc.sentences[sentence_index].words[word_index].lemma = all_lemmas[lemma_index] + lemma_index += 1 + + return doc + + def configure_optimizers(self): + return torch.optim.AdamW(self.parameters()) + + def training_step(self, batch, batch_idx): + y_char_pred, y_case_pred = self.forward(batch) + y_char_target, y_case_target = batch['y_char'], batch['y_case'] + loss_char = F.cross_entropy(y_char_pred.view(-1, y_char_pred.shape[2]), y_char_target.view(-1), ignore_index=0) + loss_case = F.cross_entropy(y_case_pred.view(-1, y_case_pred.shape[2]), y_case_target.view(-1), ignore_index=0) + return loss_char + loss_case + + def validation_step(self, batch, batch_idx): + y_char_target, y_case_target = batch['y_char'], batch['y_case'] + del batch['y_char'] + y_char_pred, y_case_pred = self.forward(batch) + language_result = {lang_id: {'total': 0, 'ok': 0} + for lang_id in range(self._num_languages)} + + y_char_target = y_char_target.detach().cpu().numpy() + y_char_pred = torch.argmax(y_char_pred.detach(), dim=-1).cpu().numpy() + lang = batch['x_lang'].detach().cpu().numpy() + for lang_id, y_pred, y_target in zip(lang, y_char_pred, y_char_target): + valid = True + for y_p, y_t in zip(y_pred, y_target): + if y_t != 0 and y_p != y_t: + valid = False + break + if valid: + language_result[lang_id - 1]['ok'] += 1 + language_result[lang_id - 1]['total'] += 1 + + return {'acc': language_result} + + def validation_epoch_end(self, outputs: List[Any]) -> None: + language_result = {lang_id: {'total': 0, 'ok': 0} + for lang_id in + range(self._num_languages)} + for result in outputs: + for lang_id in result['acc']: + language_result[lang_id]['ok'] += result['acc'][lang_id]['ok'] + language_result[lang_id]['total'] += result['acc'][lang_id]['total'] + + res = {} + for lang_index in language_result: + total = language_result[lang_index]['total'] + if total == 0: + total = 1 + if self._language_codes is None: + lang = lang_index + else: + lang = self._language_codes[lang_index] + res[lang] = { + "acc": language_result[lang_index]['ok'] / total, + } + + self.log('val/ACC/{0}'.format(lang), language_result[lang_index]['ok'] / total) + + # single value for early stopping + self._epoch_results = self._compute_early_stop(res) + self.log('val/early_meta', self._early_stop_meta_val) + + def _compute_early_stop(self, res): + for lang in res: + if res[lang]["acc"] > self._res[lang]["acc"]: + self._early_stop_meta_val += 1 + self._res[lang]["acc"] = res[lang]["acc"] + res[lang]["acc_best"] = True + return res + + class 
PrintAndSaveCallback(pl.callbacks.Callback): + def __init__(self, store_prefix): + super().__init__() + self.store_prefix = store_prefix + + def on_validation_end(self, trainer, pl_module): + metrics = trainer.callback_metrics + epoch = trainer.current_epoch + + for lang in pl_module._epoch_results: + res = pl_module._epoch_results[lang] + if "acc_best" in res: + trainer.save_checkpoint(self.store_prefix + "." + lang + ".best") + + trainer.save_checkpoint(self.store_prefix + ".last") + + s = "{0:30s}\tACC".format("Language") + print("\n\n\t" + s) + print("\t" + ("=" * (len(s) + 16))) + for lang in pl_module._language_codes: + acc = metrics["val/ACC/{0}".format(lang)] + msg = "\t{0:30s}:\t{1:.4f}".format(lang, acc) + print(msg) + print("\n") diff --git a/cube/networks/lm.py b/cube/networks/lm.py new file mode 100644 index 000000000..8fb1d358e --- /dev/null +++ b/cube/networks/lm.py @@ -0,0 +1,217 @@ +import sys +import tqdm +from abc import abstractmethod + +import torch +import numpy as np +from typing import * + +sys.path.append('') +from transformers import AutoTokenizer +from transformers import AutoModel +from cube.io_utils.objects import Sentence, Document +import fasttext +import fasttext.util + + +class LMHelper: + def __init__(self): + pass + + @abstractmethod + def get_embedding_size(self): + pass + + @abstractmethod + def apply(self, document: Document): + pass + + @abstractmethod + def apply_raw(self, batch): + pass + + +class LMHelperFT(LMHelper): + def __init__(self, device: str = 'cpu', model: str = None): + from pathlib import Path + home = str(Path.home()) + filename = '{0}/.fasttext/cc.{1}.300.bin'.format(home, model) + import os + if not os.path.exists(filename): + fasttext.util.download_model(model, if_exists='ignore') # English + in_file = "cc.{0}.300.bin".format(model) + import shutil + import pathlib + print("Creating " + "{0}/.fasttext/".format(home)) + pathlib.Path("{0}/.fasttext/".format(home)).mkdir(parents=True, exist_ok=True) + shutil.move(in_file, filename) + self._fasttext = fasttext.load_model(filename) + + def get_embedding_size(self): + return [300] + + def apply(self, document: Document): + for ii in tqdm.tqdm(range(len(document.sentences)), desc="Pre-computing embeddings", unit="sent"): + for jj in range(len(document.sentences[ii].words)): + document.sentences[ii].words[jj].emb = [self._fasttext.get_word_vector( + document.sentences[ii].words[jj].word)] + + def apply_raw(self, batch): + embeddings = [] + for ii in range(len(batch)): + c_emb = [] + for jj in range(len(batch[ii])): + c_emb.append(self._fasttext.get_word_vector(batch[ii][jj])) + embeddings.append(c_emb) + return embeddings + + +class LMHelperLanguasito(LMHelper): + def __init__(self, device: str = 'cpu', model: str = None): + if model is None: + print("UserWarning: No languasito model was specified. 
Instance will fail") + from languasito.api import LanguasitoAPI + self._languasito = LanguasitoAPI.load(model) + self._languasito.to(device) + + def get_embedding_size(self): + # TODO: a better way to get the embedding size (right now it is hardcoded) + return [1024] + + def apply(self, document: Document): + BATCH_SIZE = 8 + num_batches = len(document.sentences) // BATCH_SIZE + if len(document.sentences) % BATCH_SIZE != 0: + num_batches += 1 + + for iBatch in tqdm.tqdm(range(num_batches), desc="Pre-computing embeddings", unit="sent"): + start = iBatch * BATCH_SIZE + stop = min(iBatch * BATCH_SIZE + BATCH_SIZE, len(document.sentences)) + batch = [] + for ii in range(start, stop): + cb = [] + for w in document.sentences[ii].words: + cb.append(w.word) + batch.append(cb) + embeddings = self._languasito(batch) + for ii in range(len(batch)): + for jj in range(len(batch[ii])): + document.sentences[ii + start].words[jj].emb = [embeddings[ii][jj]] + + def apply_raw(self, batch): + BATCH_SIZE = 8 + num_batches = len(batch) // BATCH_SIZE + if len(batch) % BATCH_SIZE != 0: + num_batches += 1 + + for iBatch in range(num_batches): + start = iBatch * BATCH_SIZE + stop = min(iBatch * BATCH_SIZE + BATCH_SIZE, len(batch)) + tb = [] + for ii in range(start, stop): + cb = [] + for w in batch[ii]: + cb.append(w) + tb.append(cb) + embeddings = self._languasito(batch) + + return embeddings + + +class LMHelperHF(LMHelper): + def __init__(self, device: str = 'cpu', model: str = None): + if model is None: + self._splitter = AutoTokenizer.from_pretrained('xlm-roberta-base') + self._xlmr = AutoModel.from_pretrained('xlm-roberta-base', + output_hidden_states=True) + else: + self._splitter = AutoTokenizer.from_pretrained(model) + self._xlmr = AutoModel.from_pretrained(model, output_hidden_states=True) + self._xlmr.eval() + self._xlmr.to(device) + self._device = device + tmp = self._xlmr(torch.tensor([[100]], device=device)) + h_state_size = tmp['hidden_states'][0].shape[-1] + self._emb_size = [h_state_size for _ in range(len(tmp['hidden_states']))] + + def get_embedding_size(self): + # TODO: a better way to get the embedding size (right now it is hardcoded) + return self._emb_size + + def _compute_we(self, batch: [Sentence]): + # XML-Roberta + + # convert all words into wordpiece indices + word2pieces = {} + new_sents = [] + START = 0 + PAD = 1 + END = 2 + for ii in range(len(batch)): + c_sent = [START] + pos = 1 + for jj in range(len(batch[ii].words)): + word = batch[ii].words[jj].word + pieces = self._splitter(word)['input_ids'][1:-1] + word2pieces[(ii, jj)] = [] + for piece in pieces: + c_sent.append(piece) + word2pieces[(ii, jj)].append([ii, pos]) + pos += 1 + c_sent.append(END) + new_sents.append(c_sent) + max_len = max([len(s) for s in new_sents]) + input_ids = np.ones((len(new_sents), max_len), dtype=np.long) * PAD # pad everything + for ii in range(input_ids.shape[0]): + for jj in range(input_ids.shape[1]): + if jj < len(new_sents[ii]): + input_ids[ii, jj] = new_sents[ii][jj] + with torch.no_grad(): + x = torch.tensor(input_ids, device=self._device) + max_s_len = x.shape[1] + count = max_s_len // 512 + + if max_s_len % 512 != 0: + count += 1 + we_list = [] + for index in range(count): + out = self._xlmr(x[:, index * 512:min(x.shape[1], index * 512 + 512)], return_dict=True) + we = torch.cat(out['hidden_states'], dim=-1).detach().cpu() + we_list.append(we) + we = torch.cat(we_list, dim=1).numpy() + + word_emb = [] + for ii in range(len(batch)): + for jj in range(len(batch[ii].words)): + pieces = word2pieces[ii, 
jj] + if len(pieces) != 0: + m = we[pieces[0][0], pieces[0][1]] + for zz in range(len(pieces) - 1): + m += we[pieces[zz][0], pieces[zz][1]] + m = m / len(pieces) + else: + m = np.zeros((768 * 13), dtype=np.float) + word_emb.append(m) + # word_emb = torch.cat(word_emb, dim=0) + + return word_emb + + def apply(self, doc: Document): + import tqdm + for sent in doc.sentences: # tqdm.tqdm(doc.sentences, desc="Pre-computing embeddings", unit="sent"): + wemb = self._compute_we([sent]) + for ii in range(len(wemb)): + ww = wemb[ii] + www = [] + for kk in range(13): + www.append(ww[kk * 768:kk * 768 + 768]) + sent.words[ii].emb = www + + def apply_raw(self, batch): + pass + + +if __name__ == "__main__": + from ipdb import set_trace + + set_trace() diff --git a/cube/networks/modules.py b/cube/networks/modules.py new file mode 100644 index 000000000..09992c3c4 --- /dev/null +++ b/cube/networks/modules.py @@ -0,0 +1,578 @@ +# +# Author: Tiberiu Boros +# +# Copyright (c) 2019 Adobe Systems Incorporated. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import torch.nn as nn +import torch.nn.functional as F +import random +from torch.nn.utils.rnn import PackedSequence +from typing import * +import pytorch_lightning as pl + + +class LinearNorm(pl.LightningModule): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_normal_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + + +class MLP(nn.Module): + def __init__(self, in_dim, out_dim, hid_dim=500, dropout=0.33, hid_func=torch.tanh): + super().__init__() + self._h1 = LinearNorm(in_dim, hid_dim) + self._h2 = LinearNorm(hid_dim, out_dim) + self._dropout = dropout + self._hid_func = hid_func + + def forward(self, x): + h = torch.dropout(self._hid_func(self._h1(x)), self._dropout, self.training) + o = self._h2(h) + return o + + +class Encoder(pl.LightningModule): + def __init__(self, input_type, input_size, input_emb_dim, enc_hid_dim, dropout, nn_type=nn.GRU, + num_layers=2, ext_conditioning=0): + super().__init__() + assert (input_type == 'int' or input_type == 'float') + if input_type == 'float': + assert (input_size == input_emb_dim) + self.input_type = input_type + self.input_dim = input_size + self.emb_dim = input_emb_dim + self.enc_hid_dim = enc_hid_dim + + if self.input_type == 'int': + self.embedding = nn.Sequential(nn.Embedding(input_size, input_emb_dim), nn.Dropout(dropout)) + else: + self.embedding = nn.Dropout(dropout) + if nn_type == VariationalLSTM: + self.rnn = nn_type(input_emb_dim + ext_conditioning, enc_hid_dim, bidirectional=True, num_layers=1, + dropoutw=dropout, dropouto=dropout, dropouti=dropout, batch_first=True) + self.dropout = nn.Identity() + else: + self.rnn = nn_type(input_emb_dim + ext_conditioning, enc_hid_dim, bidirectional=True, num_layers=1, + batch_first=True) + 
self.dropout = nn.Dropout(dropout) + + if num_layers > 1: + top_layers = [] + for ii in range(num_layers - 1): + if nn_type == VariationalLSTM: + top_layers.append( + nn_type(enc_hid_dim * 2 + ext_conditioning, enc_hid_dim, bidirectional=True, num_layers=1, + batch_first=True, dropoutw=dropout, dropouto=dropout, dropouti=dropout)) + else: + top_layers.append( + nn_type(enc_hid_dim * 2 + ext_conditioning, enc_hid_dim, bidirectional=True, num_layers=1, + batch_first=True)) + + self.top_layers = nn.ModuleList(top_layers) + else: + self.top_layers = None + + def forward(self, src, conditioning=None): + embedded = self.embedding(src) + + if conditioning is not None: + # conditioning = conditioning.permute(0, 1) + conditioning = conditioning.unsqueeze(1) + conditioning = conditioning.repeat(1, src.shape[1], 1) + embedded = torch.cat((embedded, conditioning), dim=2) + + outputs, hidden = self.rnn(embedded) + if self.top_layers is not None: + for rnn_layer in self.top_layers: + if conditioning is not None: + outputs, hidden = rnn_layer(torch.cat((self.dropout(outputs), conditioning), dim=2)) + else: + outputs, hidden = rnn_layer(self.dropout(outputs)) + if isinstance(hidden, list) or isinstance(hidden, tuple): # we have a LSTM + hidden = hidden[1] + hidden = torch.cat((hidden[-1, :, :], hidden[0, :, :]), dim=1) + + return outputs, hidden + + +class BilinearAttention(nn.Module): + def __init__(self, dim1, dim2): + super().__init__() + self.biliniar = nn.Bilinear(dim1, dim2, 1) + self.linear = nn.Linear(dim1 + dim2, 1) + + def forward(self, query, keys): + query = query.unsqueeze(1).repeat(1, keys.shape[1], 1) + biliniar = self.biliniar(query, keys) + h = torch.cat([query, keys], dim=-1) + liniar = self.linear(h) + return (liniar + biliniar).squeeze(2) + + +class Attention(pl.LightningModule): + def __init__(self, enc_hid_dim, dec_hid_dim, att_proj_size=100): + super().__init__() + + self.enc_hid_dim = enc_hid_dim + self.dec_hid_dim = dec_hid_dim + + # self.attn = LinearNorm((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim) + self.attn = ConvNorm(enc_hid_dim * 2 + dec_hid_dim, att_proj_size, kernel_size=5, + w_init_gain='tanh') + self.v = nn.Parameter(torch.rand(att_proj_size)) + + def forward(self, hidden, encoder_outputs, return_logsoftmax=False): + # hidden = [batch size, dec hid dim] + # encoder_outputs = [src sent len, batch size, enc hid dim * 2] + batch_size = encoder_outputs.shape[0] + src_len = encoder_outputs.shape[1] + # repeat encoder hidden state src_len times + hidden = hidden.unsqueeze(1).repeat(1, src_len, 1) + encoder_outputs = encoder_outputs + # hidden = [batch size, src sent len, dec hid dim] + # encoder_outputs = [batch size, src sent len, enc hid dim * 2] + energy = torch.dropout( + torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2).transpose(1, 2)).transpose(1, 2)), 0.1, + self.training) + energy = energy.transpose(1, 2) + # energy = [batch size, src sent len, dec hid dim] + # energy = [batch size, dec hid dim, src sent len] + # v = [dec hid dim] + v = self.v.repeat(batch_size, 1).unsqueeze(1) + # v = [batch size, 1, dec hid dim] + attention = torch.bmm(v, energy).squeeze(1) + # attention= [batch size, src len] + if return_logsoftmax: + return F.log_softmax(attention, dim=1) + else: + return F.softmax(attention, dim=1) + + +class Decoder(pl.LightningModule): + def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, nn_type=nn.GRU, num_layers=2): + super().__init__() + + self.emb_dim = emb_dim + self.enc_hid_dim = enc_hid_dim + 
self.dec_hid_dim = dec_hid_dim + self.output_dim = output_dim + self.dropout = dropout + self.attention = attention + + self.embedding = nn.Embedding(output_dim, emb_dim) + self.rnn = nn_type((enc_hid_dim * 2) + emb_dim, dec_hid_dim, num_layers=num_layers) + self.out = LinearNorm((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, input, hidden, encoder_outputs): + input = input.unsqueeze(0) + embedded = self.dropout(self.embedding(input)) + a = self.attention(hidden, encoder_outputs) + a = a.unsqueeze(1) + encoder_outputs = encoder_outputs + weighted = torch.bmm(a, encoder_outputs) + weighted = weighted + rnn_input = torch.cat((embedded, weighted), dim=2) + output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0)) + + assert (output == hidden).all() + embedded = embedded.squeeze(0) + output = output.squeeze(0) + weighted = weighted.squeeze(0) + output = self.out(torch.cat((output, weighted, embedded), dim=1)) + return output, hidden.squeeze(0) + + +class Seq2Seq(pl.LightningModule): + def __init__(self, encoder, decoder, device): + super().__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + + def forward(self, src, trg, teacher_forcing_ratio=0.5): + # src = [src sent len, batch size] + # trg = [trg sent len, batch size] + # teacher_forcing_ratio is probability to use teacher forcing + # e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time + batch_size = src.shape[1] + max_len = trg.shape[0] + trg_vocab_size = self.decoder.output_dim + # tensor to store decoder outputs + outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device) + # encoder_outputs is all hidden states of the input sequence, back and forwards + # hidden is the final forward and backward hidden states, passed through a linear layer + encoder_outputs, hidden = self.encoder(src) + # first input to the decoder is the tokens + output = trg[0, :] + for t in range(1, max_len): + output, hidden = self.decoder(output, hidden, encoder_outputs) + outputs[t] = output + teacher_force = random.random() < teacher_forcing_ratio + top1 = output.max(1)[1] + output = (trg[t] if teacher_force else top1) + + return outputs + + +# The code is adapted from https://github.com/keitakurita/Better_LSTM_PyTorch - no pip package is provided so we are +# cloning the code for testing + +class VariationalDropout(pl.LightningModule): + """ + Applies the same dropout mask across the temporal dimension + See https://arxiv.org/abs/1512.05287 for more details. + Note that this is not applied to the recurrent activations in the LSTM like the above paper. + Instead, it is applied to the inputs and outputs of the recurrent layer. 
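+    Concretely, for input of shape (batch, time, features) with batch_first=True, a single
+    (batch, 1, features) Bernoulli mask is sampled, broadcast over the time dimension, and the
+    surviving activations are rescaled by 1 / (1 - dropout).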
+ """ + + def __init__(self, dropout: float, batch_first: Optional[bool] = False): + super().__init__() + self.dropout = dropout + self.batch_first = batch_first + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.training or self.dropout <= 0.: + return x + + is_packed = isinstance(x, PackedSequence) + if is_packed: + x, batch_sizes = x + max_batch_size = int(batch_sizes[0]) + else: + batch_sizes = None + if self.batch_first: + max_batch_size = x.size(0) + else: + max_batch_size = x.size(1) + + # Drop same mask across entire sequence + if self.batch_first: + m = x.new_empty(max_batch_size, 1, x.size(2), requires_grad=False).bernoulli_(1 - self.dropout) + else: + m = x.new_empty(1, max_batch_size, x.size(2), requires_grad=False).bernoulli_(1 - self.dropout) + x = x.masked_fill(m == 0, 0) / (1 - self.dropout) + + if is_packed: + return PackedSequence(x, batch_sizes) + else: + return x + + +class VariationalLSTM(pl.LightningModule): + def __init__(self, *args, dropouti: float = 0., + dropoutw: float = 0., dropouto: float = 0., + batch_first=True, unit_forget_bias=True, **kwargs): + super().__init__(*args, **kwargs, batch_first=batch_first) + self.unit_forget_bias = unit_forget_bias + self.dropoutw = dropoutw + self.input_drop = VariationalDropout(dropouti, + batch_first=batch_first) + self.output_drop = VariationalDropout(dropouto, + batch_first=batch_first) + self._init_weights() + + def _init_weights(self): + """ + Use orthogonal init for recurrent layers, xavier uniform for input layers + Bias is 0 except for forget gate + """ + for name, param in self.named_parameters(): + if "weight_hh" in name: + nn.init.orthogonal_(param.data) + elif "weight_ih" in name: + nn.init.xavier_uniform_(param.data) + elif "bias" in name and self.unit_forget_bias: + nn.init.zeros_(param.data) + param.data[self.hidden_size:2 * self.hidden_size] = 1 + + def _drop_weights(self): + for name, param in self.named_parameters(): + if "weight_hh" in name: + getattr(self, name).data = \ + torch.nn.functional.dropout(param.data, p=self.dropoutw, + training=self.training).contiguous() + + def forward(self, input, hx=None): + self._drop_weights() + self.flatten_parameters() + input = self.input_drop(input) + seq, state = super().forward(input, hx=hx) + return self.output_drop(seq), state + + +class ConvNorm(pl.LightningModule): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert (kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_normal_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +# class WordGram(pl.LightningModule): +# def __init__(self, num_chars: int, num_langs: int, num_filters=512, char_emb_size=256, case_emb_size=32, +# lang_emb_size=32, num_layers=3): +# super(WordGram, self).__init__() +# NUM_FILTERS = num_filters +# self._num_filters = NUM_FILTERS +# self._lang_emb = nn.Embedding(num_langs + 1, lang_emb_size) +# self._tok_emb = nn.Embedding(num_chars + 1, char_emb_size) +# self._case_emb = nn.Embedding(4, case_emb_size) +# self._num_layers = num_layers +# +# convolutions_char = [] +# cs_inp = char_emb_size + lang_emb_size + case_emb_size +# 
for _ in range(num_layers): +# conv_layer = nn.Sequential( +# ConvNorm(cs_inp, +# NUM_FILTERS, +# kernel_size=5, stride=1, +# padding=2, +# dilation=1, w_init_gain='tanh'), +# nn.BatchNorm1d(NUM_FILTERS)) +# convolutions_char.append(conv_layer) +# cs_inp = NUM_FILTERS // 2 + lang_emb_size +# self._convolutions_char = nn.ModuleList(convolutions_char) +# +# self._rnn = nn.LSTM(NUM_FILTERS // 2, NUM_FILTERS // 2, num_layers=2) +# self._pre_out = LinearNorm(NUM_FILTERS // 2, NUM_FILTERS // 2) +# +# def forward(self, x_char, x_case, x_lang, x_mask, x_word_len): +# x_char = self._tok_emb(x_char) +# x_case = self._case_emb(x_case) +# x_lang = self._lang_emb(x_lang) +# +# x = torch.cat([x_char, x_case], dim=-1) +# x = x.permute(0, 2, 1) +# x_lang = x_lang.unsqueeze(1).repeat(1, x_case.shape[1], 1).permute(0, 2, 1) +# half = self._num_filters // 2 +# count = 0 +# res = None +# skip = None +# for conv in self._convolutions_char: +# count += 1 +# drop = self.training +# if count >= len(self._convolutions_char): +# drop = False +# if skip is not None: +# x = x + skip +# +# x = torch.cat([x, x_lang], dim=1) +# conv_out = conv(x) +# tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) +# if res is None: +# res = tmp +# else: +# res = res + tmp +# skip = tmp +# x = torch.dropout(tmp, 0.1, drop) +# x = x + res +# x = x.permute(0, 2, 1) +# x = torch.flip(x, dims=[1]) +# out, _ = self._rnn(x) +# norm = out[:, -1, :] +# +# return torch.tanh(self._pre_out(norm)) +# +# def _get_device(self): +# if self._lang_emb.weight.device.type == 'cpu': +# return 'cpu' +# return '{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index)) +# +# def save(self, path): +# torch.save(self.state_dict(), path) +# +# def load(self, path): +# self.load_state_dict(torch.load(path, map_location='cpu')) + + +class WordGram(pl.LightningModule): + def __init__(self, num_chars: int, num_langs: int, num_filters=512, char_emb_size=256, case_emb_size=32, + lang_emb_size=32, num_layers=3): + super(WordGram, self).__init__() + NUM_FILTERS = num_filters + self._num_filters = NUM_FILTERS + self._lang_emb = nn.Embedding(num_langs + 1, lang_emb_size) + self._tok_emb = nn.Embedding(num_chars + 1, char_emb_size) + self._case_emb = nn.Embedding(4, case_emb_size) + self._num_layers = num_layers + convolutions_char = [] + cs_inp = char_emb_size + lang_emb_size + case_emb_size + for _ in range(num_layers): + conv_layer = nn.Sequential( + ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + convolutions_char.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + lang_emb_size + self._convolutions_char = nn.ModuleList(convolutions_char) + self._pre_out = LinearNorm(NUM_FILTERS // 2, NUM_FILTERS // 2) + + def forward(self, x_char, x_case, x_lang, x_mask, x_word_len): + x_char = self._tok_emb(x_char) + x_case = self._case_emb(x_case) + x_lang = self._lang_emb(x_lang) + + x = torch.cat([x_char, x_case], dim=-1) + x = x.permute(0, 2, 1) + x_lang = x_lang.unsqueeze(1).repeat(1, x_case.shape[1], 1).permute(0, 2, 1) + half = self._num_filters // 2 + count = 0 + res = None + skip = None + for conv in self._convolutions_char: + count += 1 + drop = self.training + if count >= len(self._convolutions_char): + drop = False + if skip is not None: + x = x + skip + + x = torch.cat([x, x_lang], dim=1) + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = 
tmp + else: + res = res + tmp + skip = tmp + x = torch.dropout(tmp, 0.1, drop) + x = x + res + x = x.permute(0, 2, 1) + x = x * x_mask.unsqueeze(2) + pre = torch.sum(x, dim=1, dtype=torch.float) + norm = pre / torch.clip(x_word_len.unsqueeze(1), min=1) + return torch.tanh(self._pre_out(norm)) + + def _get_device(self): + if self._lang_emb.weight.device.type == 'cpu': + return 'cpu' + return '{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index)) + + def save(self, path): + torch.save(self.state_dict(), path) + + def load(self, path): + self.load_state_dict(torch.load(path, map_location='cpu')) + + +''' +Code adapted from https://github.com/stanfordnlp/stanza/ +''' + + +class PairwiseBilinear(nn.Module): + ''' A bilinear module that deals with broadcasting for efficient memory usage. + Input: tensors of sizes (N x L1 x D1) and (N x L2 x D2) + Output: tensor of size (N x L1 x L2 x O)''' + + def __init__(self, input1_size, input2_size, output_size, bias=True): + super().__init__() + + self.input1_size = input1_size + self.input2_size = input2_size + self.output_size = output_size + + self.weight = nn.Parameter(torch.Tensor(input1_size, input2_size, output_size)) + self.bias = nn.Parameter(torch.Tensor(output_size)) if bias else 0 + + def forward(self, input1, input2): + input1_size = list(input1.size()) + input2_size = list(input2.size()) + output_size = [input1_size[0], input1_size[1], input2_size[1], self.output_size] + + # ((N x L1) x D1) * (D1 x (D2 x O)) -> (N x L1) x (D2 x O) + intermediate = torch.mm(input1.view(-1, input1_size[-1]), + self.weight.view(-1, self.input2_size * self.output_size)) + # (N x L2 x D2) -> (N x D2 x L2) + input2 = input2.transpose(1, 2) + # (N x (L1 x O) x D2) * (N x D2 x L2) -> (N x (L1 x O) x L2) + output = intermediate.view(input1_size[0], input1_size[1] * self.output_size, input2_size[2]).bmm(input2) + # (N x (L1 x O) x L2) -> (N x L1 x L2 x O) + output = output.view(input1_size[0], input1_size[1], self.output_size, input2_size[1]).transpose(2, 3) + + return output + + +class BiaffineScorer(nn.Module): + def __init__(self, input1_size, input2_size, output_size): + super().__init__() + self.W_bilin = nn.Bilinear(input1_size + 1, input2_size + 1, output_size) + + self.W_bilin.weight.data.zero_() + self.W_bilin.bias.data.zero_() + + def forward(self, input1, input2): + input1 = torch.cat([input1, input1.new_ones(*input1.size()[:-1], 1)], len(input1.size()) - 1) + input2 = torch.cat([input2, input2.new_ones(*input2.size()[:-1], 1)], len(input2.size()) - 1) + return self.W_bilin(input1, input2) + + +class PairwiseBiaffineScorer(nn.Module): + def __init__(self, input1_size, input2_size, output_size): + super().__init__() + self.W_bilin = PairwiseBilinear(input1_size + 1, input2_size + 1, output_size) + + self.W_bilin.weight.data.zero_() + self.W_bilin.bias.data.zero_() + + def forward(self, input1, input2): + input1 = torch.cat([input1, input1.new_ones(*input1.size()[:-1], 1)], len(input1.size()) - 1) + input2 = torch.cat([input2, input2.new_ones(*input2.size()[:-1], 1)], len(input2.size()) - 1) + return self.W_bilin(input1, input2) + + +class DeepBiaffine(nn.Module): + def __init__(self, input1_size, input2_size, hidden_size, output_size, hidden_func=F.relu, dropout=0, + pairwise=True): + super().__init__() + self.W1 = nn.Linear(input1_size, hidden_size) + self.W2 = nn.Linear(input2_size, hidden_size) + self.hidden_func = hidden_func + if pairwise: + self.scorer = PairwiseBiaffineScorer(hidden_size, hidden_size, output_size) 
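+        # pairwise=True scores every (head, dependent) combination at once -> (N, L1, L2, O),
+        # which is what arc scoring needs; pairwise=False scores aligned pairs only -> (N, L, O),
+        # which the parser uses for label prediction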
+        else:
+            self.scorer = BiaffineScorer(hidden_size, hidden_size, output_size)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input1, input2):
+        return self.scorer(self.dropout(self.hidden_func(self.W1(input1))),
+                           self.dropout(self.hidden_func(self.W2(input2))))
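As a quick sanity check of the shape contract documented above, the following snippet (illustrative only, not part of the patch) exercises PairwiseBilinear directly; with the zero initialization the outputs are all zeros, so only the shapes are meaningful:

    import torch

    pb = PairwiseBilinear(input1_size=4, input2_size=4, output_size=3)
    a = torch.randn(2, 5, 4)  # (N x L1 x D1)
    b = torch.randn(2, 7, 4)  # (N x L2 x D2)
    print(pb(a, b).shape)     # torch.Size([2, 5, 7, 3]) == (N x L1 x L2 x O)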
diff --git a/cube/networks/parser.py b/cube/networks/parser.py
new file mode 100644
index 000000000..0745aa3ee
--- /dev/null
+++ b/cube/networks/parser.py
@@ -0,0 +1,483 @@
+import sys
+
+from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
+
+sys.path.append('')
+import os, argparse
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import pytorch_lightning as pl
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from torch.utils.data.dataset import Dataset
+from torch.utils.data import DataLoader
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
+import numpy as np
+from cube.io_utils.objects import Document, Sentence, Token, Word
+from cube.io_utils.encodings import Encodings
+from cube.io_utils.config import ParserConfig
+from cube.networks.modules import ConvNorm, LinearNorm, BilinearAttention, Attention, MLP, DeepBiaffine
+import random
+
+from cube.networks.utils import MorphoCollate, MorphoDataset, GreedyDecoder, ChuLiuEdmondsDecoder, unpack, mask_concat
+
+from cube.networks.modules import WordGram
+
+
+class Parser(pl.LightningModule):
+    def __init__(self, config: ParserConfig, encodings: Encodings, language_codes: [] = None, ext_word_emb=0):
+        super().__init__()
+        self._config = config
+        self._encodings = encodings
+        if not isinstance(ext_word_emb, list):
+            ext_word_emb = [ext_word_emb]
+        self._ext_word_emb = ext_word_emb
+
+        self._word_net = WordGram(len(encodings.char2int), num_langs=encodings.num_langs + 1,
+                                  num_filters=config.char_filter_size, char_emb_size=config.char_emb_size,
+                                  lang_emb_size=config.lang_emb_size, num_layers=config.char_layers)
+        self._zero_emb = nn.Embedding(1, config.char_filter_size // 2)
+        self._num_langs = encodings.num_langs
+        self._language_codes = language_codes
+
+        ext2int = []
+        for input_size in self._ext_word_emb:
+            module = MLP(input_size, config.external_proj_size)
+            ext2int.append(module)
+        self._ext_proj = nn.ModuleList(ext2int)
+
+        conv_layers = []
+        cs_inp = config.char_filter_size // 2 + config.lang_emb_size + config.word_emb_size + config.external_proj_size
+        NUM_FILTERS = config.cnn_filter
+        for _ in range(config.cnn_layers):
+            conv_layer = nn.Sequential(
+                ConvNorm(cs_inp,
+                         NUM_FILTERS,
+                         kernel_size=5, stride=1,
+                         padding=2,
+                         dilation=1, w_init_gain='tanh'),
+                nn.BatchNorm1d(NUM_FILTERS))
+            conv_layers.append(conv_layer)
+            cs_inp = NUM_FILTERS // 2 + config.lang_emb_size
+        self._word_emb = nn.Embedding(len(encodings.word2int), config.word_emb_size, padding_idx=0)
+        self._lang_emb = nn.Embedding(encodings.num_langs + 1, config.lang_emb_size, padding_idx=0)
+        self._convs = nn.ModuleList(conv_layers)
+
+        self._aupos = LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.upos2int))
+        self._axpos = LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.xpos2int))
+        self._aattrs = LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.attrs2int))
+
+        self._pre_morpho = LinearNorm(NUM_FILTERS // 2 + config.lang_emb_size, NUM_FILTERS // 2)
+        self._upos = LinearNorm(NUM_FILTERS // 2 + config.lang_emb_size, len(encodings.upos2int))
+        self._attrs = LinearNorm(64 + NUM_FILTERS // 2 + config.lang_emb_size, len(encodings.attrs2int))
+        self._xpos = LinearNorm(64 + NUM_FILTERS // 2 + config.lang_emb_size, len(encodings.xpos2int))
+        self._upos_emb = nn.Embedding(len(encodings.upos2int), 64)
+
+        self._rnn = nn.LSTM(NUM_FILTERS // 2 + config.lang_emb_size + config.external_proj_size, config.rnn_size,
+                            num_layers=config.rnn_layers, batch_first=True, bidirectional=True, dropout=0.33)
+
+        self._pre_out = LinearNorm(config.rnn_size * 2 + config.lang_emb_size, config.pre_parser_size)
+        self._head = DeepBiaffine(config.pre_parser_size, config.pre_parser_size, config.head_size, 1, dropout=0.1)
+        self._label = DeepBiaffine(config.pre_parser_size, config.pre_parser_size, config.label_size,
+                                   len(encodings.label2int), dropout=0.1, pairwise=False)
+        self._r_emb = nn.Embedding(1,
+                                   config.char_filter_size // 2 + config.lang_emb_size + config.word_emb_size + config.external_proj_size)
+
+        # self._decoder = GreedyDecoder()
+        self._decoder = ChuLiuEdmondsDecoder()
+
+        if self._language_codes:
+            self._res = {}
+            for language_code in self._language_codes:
+                self._res[language_code] = {"upos": 0., "attrs": 0., 'uas': 0., 'las': 0.}
+            self._early_stop_meta_val = 0
+
+    def _compute_early_stop(self, res):
+        for lang in res:
+            if res[lang]["uas"] > self._res[lang]["uas"]:
+                self._early_stop_meta_val += 1
+                self._res[lang]["uas"] = res[lang]["uas"]
+                res[lang]["uas_best"] = True
+            if res[lang]["las"] > self._res[lang]["las"]:
+                self._early_stop_meta_val += 1
+                self._res[lang]["las"] = res[lang]["las"]
+                res[lang]["las_best"] = True
+        return res
+
+    def forward(self, X):
+        x_sents = X['x_sent']
+        x_lang_sent = X['x_lang_sent']
+        x_words_chars = X['x_word']
+        x_words_case = X['x_word_case']
+        x_lang_word = X['x_lang_word']
+        x_sent_len = X['x_sent_len']
+        x_word_len = X['x_word_len']
+        x_sent_masks = X['x_sent_masks']
+        x_word_masks = X['x_word_masks']
+        x_word_emb_packed = X['x_word_embeddings']
+        gs_upos = None
+        if 'y_upos' in X:
+            gs_upos = X['y_upos']
+        char_emb_packed = self._word_net(x_words_chars, x_words_case, x_lang_word, x_word_masks, x_word_len)
+
+        sl = x_sent_len.cpu().numpy()
+
+        char_emb = unpack(char_emb_packed, sl, x_sents.shape[1], device=self._get_device())
+        word_emb_ext = None
+
+        for ii in range(len(x_word_emb_packed)):
+            we = unpack(x_word_emb_packed[ii], sl, x_sents.shape[1], self._get_device())
+            if word_emb_ext is None:
+                word_emb_ext = self._ext_proj[ii](we)
+            else:
+                word_emb_ext = word_emb_ext + self._ext_proj[ii](we)
+
+        word_emb_ext = word_emb_ext / len(x_word_emb_packed)
+        word_emb_ext = torch.tanh(word_emb_ext)
+
+        lang_emb = self._lang_emb(x_lang_sent)
+        lang_emb = lang_emb.unsqueeze(1).repeat(1, char_emb.shape[1] + 1, 1)
+
+        aupos = self._aupos(torch.cat([char_emb, lang_emb[:, 1:, :]], dim=-1))
+        aattrs = self._aattrs(torch.cat([char_emb, lang_emb[:, 1:, :]], dim=-1))
+        axpos = self._axpos(torch.cat([char_emb, lang_emb[:, 1:, :]], dim=-1))
+
+        word_emb = self._word_emb(x_sents)
+
+        x = mask_concat([word_emb, char_emb, word_emb_ext], 0.33,
self.training, self._get_device()) + + x = torch.cat([x, lang_emb[:, 1:, :]], dim=-1) + # prepend root + root_emb = self._r_emb(torch.zeros((x.shape[0], 1), device=self._get_device(), dtype=torch.long)) + x = torch.cat([root_emb, x], dim=1) + x = x.permute(0, 2, 1) + lang_emb = lang_emb.permute(0, 2, 1) + half = self._config.cnn_filter // 2 + res = None + hidden = None + cnt = 0 + for conv in self._convs: + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = tmp + else: + res = res + tmp + x = torch.dropout(tmp, 0.2, self.training) + cnt += 1 + if cnt == self._config.aux_softmax_location: + hidden = torch.cat([x + res, lang_emb], dim=1) + if cnt != self._config.cnn_layers: + x = torch.cat([x, lang_emb], dim=1) + + x = x + res + x_parse = x.permute(0, 2, 1) + # aux tagging + lang_emb = lang_emb.permute(0, 2, 1) + hidden = hidden.permute(0, 2, 1)[:, 1:, :] + pre_morpho = torch.dropout(torch.tanh(self._pre_morpho(hidden)), 0.33, self.training) + pre_morpho = torch.cat([pre_morpho, lang_emb[:, 1:, :]], dim=2) + upos = self._upos(pre_morpho) + if gs_upos is None: + upos_idx = torch.argmax(upos, dim=-1) + else: + upos_idx = gs_upos + + upos_emb = self._upos_emb(upos_idx) + attrs = self._attrs(torch.cat([pre_morpho, upos_emb], dim=-1)) + xpos = self._xpos(torch.cat([pre_morpho, upos_emb], dim=-1)) + + # parsing + word_emb_ext = torch.cat( + [torch.zeros((word_emb_ext.shape[0], 1, self._config.external_proj_size), device=self._get_device(), + dtype=torch.float), word_emb_ext], dim=1) + x = mask_concat([x_parse, word_emb_ext], 0.33, self.training, self._get_device()) + x = torch.cat([x, lang_emb], dim=-1) + output, _ = self._rnn(x) + output = torch.cat([output, lang_emb], dim=-1) + pre_parsing = torch.dropout(torch.tanh(self._pre_out(output)), 0.33, self.training) + # h_r1 = torch.tanh(self._head_r1(pre_parsing)) + # h_r2 = torch.tanh(self._head_r2(pre_parsing)) + # l_r1 = torch.tanh(self._label_r1(pre_parsing)) + # l_r2 = torch.tanh(self._label_r2(pre_parsing)) + # att_stack = [] + # for ii in range(1, h_r1.shape[1]): + # a = self._att_net(h_r1[:, ii, :], h_r2) + # att_stack.append(a.unsqueeze(1)) + # att = torch.cat(att_stack, dim=1) + heads = self._head(pre_parsing, pre_parsing) + labels = pre_parsing # self._label(pre_parsing, pre_parsing) + return heads.squeeze(-1)[:, 1:, :], labels, upos, xpos, attrs, aupos, axpos, aattrs + + def _get_labels(self, labels, heads): + x1 = labels + labs = [] + for ii in range(labels.shape[0]): + lab = [] + for jj in range(1, labels.shape[1]): + if jj <= len(heads[ii]): + lab.append(labels[ii, heads[ii][jj - 1]].unsqueeze(0).unsqueeze(0)) + else: + lab.append(labels[ii, 0].unsqueeze(0).unsqueeze(0)) + lab = torch.cat(lab, dim=1) + labs.append(lab) + x2 = torch.cat(labs, dim=0) + + labs = self._label(x1[:, 1:, :], x2) + return labs + + # def _get_labels(self, x1, x2, heads): + # x1 = x1[:, 1:, :] + # x_stack = [] + # for ii in range(x1.shape[0]): + # xx = [] + # for jj in range(x1.shape[1]): + # if jj < len(heads[ii]): + # xx.append(x2[ii, heads[ii][jj]].unsqueeze(0).unsqueeze(0)) + # else: + # xx.append(x2[ii, 0].unsqueeze(0).unsqueeze(0)) + # x_stack.append(torch.cat(xx, dim=1)) + # x_stack = torch.cat(x_stack, dim=0).contiguous() + # x1 = x1.contiguous() + # hid = torch.cat([x1, x_stack], dim=-1) + # return self._label_linear(hid) + self._label_bilinear(x1, x_stack) + + def _get_device(self): + if self._lang_emb.weight.device.type == 'cpu': + return 'cpu' + return 
'{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index))
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=1e-4)
+        return optimizer
+
+    def training_step(self, batch, batch_idx):
+        att, labels, p_upos, p_xpos, p_attrs, a_upos, a_xpos, a_attrs = self.forward(batch)
+        y_upos = batch['y_upos']
+        y_attrs = batch['y_attrs']
+        y_xpos = batch['y_xpos']
+        y_head = batch['y_head']
+        y_label = batch['y_label']
+
+        pred_labels = self._get_labels(labels, y_head.detach().cpu().numpy())
+
+        loss_upos = F.cross_entropy(p_upos.view(-1, p_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_xpos = F.cross_entropy(p_xpos.view(-1, p_xpos.shape[2]), y_xpos.view(-1), ignore_index=0)
+        loss_attrs = F.cross_entropy(p_attrs.view(-1, p_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+
+        loss_aupos = F.cross_entropy(a_upos.view(-1, a_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_axpos = F.cross_entropy(a_xpos.view(-1, a_xpos.shape[2]), y_xpos.view(-1), ignore_index=0)
+        loss_aattrs = F.cross_entropy(a_attrs.view(-1, a_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+
+        loss_uas = F.cross_entropy(att.reshape(-1, att.shape[2]), y_head.view(-1))
+        loss_las = F.cross_entropy(pred_labels.view(-1, pred_labels.shape[2]), y_label.view(-1), ignore_index=0)
+
+        step_loss = loss_uas + loss_las + (((loss_upos + loss_attrs + loss_xpos) / 3.) + (
+                (loss_aupos + loss_aattrs + loss_axpos) / 3.))
+
+        return {'loss': step_loss}
+
+    def validation_step(self, batch, batch_idx):
+        y_upos = batch['y_upos']
+        del batch['y_upos']
+        att, labels, p_upos, p_xpos, p_attrs, a_upos, a_xpos, a_attrs = self.forward(batch)
+        y_xpos = batch['y_xpos']
+        y_attrs = batch['y_attrs']
+        y_head = batch['y_head']
+        y_label = batch['y_label']
+        x_sent_len = batch['x_sent_len']
+        x_lang = batch['x_lang_sent']
+        sl = x_sent_len.detach().cpu().numpy()
+
+        att = torch.softmax(att, dim=-1).detach().cpu().numpy()
+        pred_heads = self._decoder.decode(att, sl)
+        pred_labels = self._get_labels(labels, pred_heads)
+
+        loss_upos = F.cross_entropy(p_upos.view(-1, p_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_attrs = F.cross_entropy(p_attrs.view(-1, p_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+        # only the UPOS and ATTRS losses are computed during validation, so average over 2
+        loss = (loss_upos + loss_attrs) / 2
+        language_result = {lang_id: {'total': 0, 'upos_ok': 0, 'xpos_ok': 0, 'attrs_ok': 0, 'uas_ok': 0, 'las_ok': 0}
+                           for lang_id in range(self._num_langs)}
+
+        pred_upos = torch.argmax(p_upos, dim=-1).detach().cpu().numpy()
+        pred_attrs = torch.argmax(p_attrs, dim=-1).detach().cpu().numpy()
+        pred_xpos = torch.argmax(p_xpos, dim=-1).detach().cpu().numpy()
+        pred_labels = torch.argmax(pred_labels, dim=-1).detach().cpu().numpy()
+        tar_upos = y_upos.detach().cpu().numpy()
+        tar_attrs = y_attrs.detach().cpu().numpy()
+        tar_xpos = y_xpos.detach().cpu().numpy()
+        tar_head = y_head.detach().cpu().numpy()
+        tar_label = y_label.detach().cpu().numpy()
+
+        x_lang = x_lang.detach().cpu().numpy()
+        for iSent in range(p_upos.shape[0]):
+            for iWord in range(sl[iSent]):
+                lang_id = x_lang[iSent] - 1
+                language_result[lang_id]['total'] += 1
+                if pred_upos[iSent, iWord] == tar_upos[iSent, iWord]:
+                    language_result[lang_id]['upos_ok'] += 1
+                if pred_attrs[iSent, iWord] == tar_attrs[iSent, iWord]:
+                    language_result[lang_id]['attrs_ok'] += 1
+                if pred_xpos[iSent, iWord] == tar_xpos[iSent, iWord]:
+                    language_result[lang_id]['xpos_ok'] += 1
+                if pred_heads[iSent][iWord] == tar_head[iSent, iWord]:
+                    language_result[lang_id]['uas_ok'] += 1
+                if pred_labels[iSent, iWord] == tar_label[iSent, iWord]:
+                    language_result[lang_id]['las_ok'] += 1
+
+        return {'loss': loss, 'acc': language_result}
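To make the decoding step concrete: `att` holds one score per (dependent, candidate-head) pair with shape (batch, words, words + 1), where column 0 is the artificial root. A minimal illustration (not part of the patch) of what the decoder consumes and returns:

    import numpy as np

    att = np.random.rand(1, 3, 4)                # one 3-word sentence
    att = att / att.sum(axis=-1, keepdims=True)  # stand-in for the softmax above
    sl = np.array([3])                           # sentence lengths
    # a purely greedy choice picks the best head per word independently...
    heads = [list(np.argmax(att[0, :sl[0]], axis=-1))]
    print(heads)  # e.g. [[0, 2, 1]] -- one head index per word, 0 = root
    # ...while ChuLiuEdmondsDecoder.decode(att, sl) additionally guarantees
    # that the chosen heads form a valid dependency tree (no cycles).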
+
+    def validation_epoch_end(self, outputs):
+        language_result = {lang_id: {'total': 0, 'upos_ok': 0, 'attrs_ok': 0, 'xpos_ok': 0, 'uas_ok': 0, 'las_ok': 0}
+                           for lang_id in range(self._num_langs)}
+
+        valid_loss_total = 0
+        total = 0
+        attrs_ok = 0
+        upos_ok = 0
+        xpos_ok = 0
+        uas_ok = 0
+        las_ok = 0
+        for out in outputs:
+            # accumulate the validation loss once per batch
+            valid_loss_total += out['loss']
+            for lang_id in language_result:
+                language_result[lang_id]['total'] += out['acc'][lang_id]['total']
+                language_result[lang_id]['upos_ok'] += out['acc'][lang_id]['upos_ok']
+                language_result[lang_id]['xpos_ok'] += out['acc'][lang_id]['xpos_ok']
+                language_result[lang_id]['attrs_ok'] += out['acc'][lang_id]['attrs_ok']
+                language_result[lang_id]['uas_ok'] += out['acc'][lang_id]['uas_ok']
+                language_result[lang_id]['las_ok'] += out['acc'][lang_id]['las_ok']
+                # global
+                total += out['acc'][lang_id]['total']
+                upos_ok += out['acc'][lang_id]['upos_ok']
+                xpos_ok += out['acc'][lang_id]['xpos_ok']
+                attrs_ok += out['acc'][lang_id]['attrs_ok']
+                uas_ok += out['acc'][lang_id]['uas_ok']
+                las_ok += out['acc'][lang_id]['las_ok']
+
+        self.log('val/loss', valid_loss_total / len(outputs))
+        self.log('val/UPOS/total', upos_ok / total)
+        self.log('val/ATTRS/total', attrs_ok / total)
+        self.log('val/XPOS/total', xpos_ok / total)
+        self.log('val/UAS/total', uas_ok / total)
+        self.log('val/LAS/total', las_ok / total)
+
+        res = {}
+        for lang_index in language_result:
+            total = language_result[lang_index]['total']
+            if total == 0:
+                total = 1
+            if self._language_codes is None:
+                lang = lang_index
+            else:
+                lang = self._language_codes[lang_index]
+            res[lang] = {
+                "upos": language_result[lang_index]['upos_ok'] / total,
+                "xpos": language_result[lang_index]['xpos_ok'] / total,
+                "attrs": language_result[lang_index]['attrs_ok'] / total,
+                "uas": language_result[lang_index]['uas_ok'] / total,
+                "las": language_result[lang_index]['las_ok'] / total
+            }
+
+            self.log('val/UPOS/{0}'.format(lang), language_result[lang_index]['upos_ok'] / total)
+            self.log('val/XPOS/{0}'.format(lang), language_result[lang_index]['xpos_ok'] / total)
+            self.log('val/ATTRS/{0}'.format(lang), language_result[lang_index]['attrs_ok'] / total)
+            self.log('val/UAS/{0}'.format(lang), language_result[lang_index]['uas_ok'] / total)
+            self.log('val/LAS/{0}'.format(lang), language_result[lang_index]['las_ok'] / total)
+
+        # single value for early stopping
+        self._epoch_results = self._compute_early_stop(res)
+        self.log('val/early_meta', self._early_stop_meta_val)
+
+    def load(self, model_path: str, device: str = 'cpu'):
+        self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])
+        self.to(device)
+
+    def process(self, doc: Document, collate: MorphoCollate, batch_size: int = 4, num_workers: int = 4) -> Document:
+        self.eval()
+        dataset = MorphoDataset(doc)
+
+        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate.collate_fn,
+                                shuffle=False, num_workers=num_workers)
+        index = 0
+        with torch.no_grad():
+            for batch in dataloader:
+                del batch['y_upos']
+                for key in batch:
+                    if isinstance(batch[key], torch.Tensor):
+                        batch[key] = batch[key].to(self._device)
+                att, labels, p_upos, p_xpos, p_attrs, a_upos, a_xpos, a_attrs = self.forward(batch)
+
+                x_sent_len = batch['x_sent_len']
+                sl = x_sent_len.detach().cpu().numpy()
+
+                batch_size = p_upos.size()[0]
+
+                att =
torch.softmax(att, dim=-1).detach().cpu().numpy() + pred_heads = self._decoder.decode(att, sl) + pred_labels = self._get_labels(labels, pred_heads) + pred_labels = torch.argmax(pred_labels.detach(), dim=-1).cpu() + p_upos = torch.argmax(p_upos, dim=-1).detach().cpu().numpy() + p_xpos = torch.argmax(p_xpos, dim=-1).detach().cpu().numpy() + p_attrs = torch.argmax(p_attrs, dim=-1).detach().cpu().numpy() + for sentence_index in range(batch_size): # for each sentence + # print(f"at index {index+sentence_index}, sentence {sentence_index} has {batch['x_sent_len'][sentence_index]} words.") + for word_index in range(batch["x_sent_len"][sentence_index]): + head = pred_heads[sentence_index][word_index] + label_id = pred_labels[sentence_index][word_index] + doc.sentences[index + sentence_index].words[word_index].head = head + doc.sentences[index + sentence_index].words[word_index].label = self._encodings.labels[label_id] + doc.sentences[index + sentence_index].words[word_index].upos = self._encodings.upos_list[ + p_upos[sentence_index, word_index]] + doc.sentences[index + sentence_index].words[word_index].xpos = self._encodings.xpos_list[ + p_xpos[sentence_index, word_index]] + doc.sentences[index + sentence_index].words[word_index].attrs = self._encodings.attrs_list[ + p_attrs[sentence_index, word_index]] + + index += batch_size + + return doc + + class PrintAndSaveCallback(pl.callbacks.Callback): + def __init__(self, store_prefix): + super().__init__() + self.store_prefix = store_prefix + + def on_validation_end(self, trainer, pl_module): + metrics = trainer.callback_metrics + epoch = trainer.current_epoch + + for lang in pl_module._epoch_results: + res = pl_module._epoch_results[lang] + if "uas_best" in res: + trainer.save_checkpoint(self.store_prefix + "." + lang + ".uas") + if "las_best" in res: + trainer.save_checkpoint(self.store_prefix + "." 
+ lang + ".las") + + trainer.save_checkpoint(self.store_prefix + ".last") + + s = "{0:30s}\tUAS\tLAS\tUPOS\tXPOS\tATTRS".format("Language") + print("\n\n\t" + s) + print("\t" + ("=" * (len(s) + 16))) + for lang in pl_module._language_codes: + uas = metrics["val/UAS/{0}".format(lang)] + las = metrics["val/LAS/{0}".format(lang)] + upos = metrics["val/UPOS/{0}".format(lang)] + xpos = metrics["val/XPOS/{0}".format(lang)] + attrs = metrics["val/ATTRS/{0}".format(lang)] + msg = "\t{0:30s}:\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}\t{5:.4f}".format(lang, uas, las, upos, xpos, + attrs) + print(msg) + print("\n") diff --git a/cube/networks/tagger.py b/cube/networks/tagger.py new file mode 100644 index 000000000..6968d0767 --- /dev/null +++ b/cube/networks/tagger.py @@ -0,0 +1,412 @@ +import sys +sys.path.append('') +import os, yaml +os.environ["TOKENIZERS_PARALLELISM"] = "false" +import pytorch_lightning as pl +import torch.nn as nn +import torch.nn.functional as F +import torch +from torch.utils.data import DataLoader +from cube.io_utils.objects import Document +from cube.io_utils.encodings import Encodings +from cube.io_utils.config import TaggerConfig +from cube.networks.modules import ConvNorm, LinearNorm, MLP +from cube.networks.utils import MorphoCollate, MorphoDataset, unpack, mask_concat +from cube.networks.modules import WordGram + +class Tagger(pl.LightningModule): + def __init__(self, config: TaggerConfig, encodings: Encodings, language_codes: [] = None, ext_word_emb=0): + super().__init__() + self._device = "cpu" # default + self._config = config + self._encodings = encodings + if not isinstance(ext_word_emb, list): + ext_word_emb = [ext_word_emb] + self._ext_word_emb = ext_word_emb + + self._word_net = WordGram(len(encodings.char2int), num_langs=encodings.num_langs + 1, + num_filters=config.char_filter_size, char_emb_size=config.char_emb_size, + lang_emb_size=config.lang_emb_size, num_layers=config.char_layers) + self._zero_emb = nn.Embedding(1, config.char_filter_size // 2) + self._num_langs = encodings.num_langs + self._language_codes = language_codes + + conv_layers = [] + cs_inp = config.char_filter_size // 2 + config.lang_emb_size + config.word_emb_size + config.external_proj_size + NUM_FILTERS = config.cnn_filter + for _ in range(config.cnn_layers): + conv_layer = nn.Sequential( + ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + conv_layers.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + config.lang_emb_size + + ext2int = [] + for input_size in self._ext_word_emb: + module = MLP(input_size, config.external_proj_size) + ext2int.append(module) + self._ext_proj = nn.ModuleList(ext2int) + self._word_emb = nn.Embedding(len(encodings.word2int), config.word_emb_size, padding_idx=0) + self._lang_emb = nn.Embedding(encodings.num_langs + 1, config.lang_emb_size, padding_idx=0) + self._convs = nn.ModuleList(conv_layers) + self._upos = LinearNorm(NUM_FILTERS // 2 + config.lang_emb_size, len(encodings.upos2int)) + self._upos_emb = nn.Embedding(len(encodings.upos2int), 64) + self._xpos = LinearNorm(64 + config.lang_emb_size + NUM_FILTERS // 2, len(encodings.xpos2int)) + self._attrs = LinearNorm(64 + config.lang_emb_size + NUM_FILTERS // 2, len(encodings.attrs2int)) + + self._aupos = LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.upos2int)) + self._axpos = LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.xpos2int)) + self._aattrs = 
LinearNorm(config.char_filter_size // 2 + config.lang_emb_size, len(encodings.attrs2int)) + + if self._language_codes: + self._res = {} + for language_code in self._language_codes: + self._res[language_code] = {"upos": 0., "xpos": 0., "attrs": 0.} + self._early_stop_meta_val = 0 + + def _compute_early_stop(self, res): + for lang in res: + if res[lang]["upos"] > self._res[lang]["upos"]: + self._early_stop_meta_val += 1 + self._res[lang]["upos"] = res[lang]["upos"] + res[lang]["upos_best"] = True + if res[lang]["xpos"] > self._res[lang]["xpos"]: + self._early_stop_meta_val += 1 + self._res[lang]["xpos"] = res[lang]["xpos"] + res[lang]["xpos_best"] = True + if res[lang]["attrs"] > self._res[lang]["attrs"]: + self._early_stop_meta_val += 1 + self._res[lang]["attrs"] = res[lang]["attrs"] + res[lang]["attrs_best"] = True + return res + + def forward(self, X): + x_sents = X['x_sent'] + x_lang_sent = X['x_lang_sent'] + x_words_chars = X['x_word'] + x_words_case = X['x_word_case'] + x_lang_word = X['x_lang_word'] + x_sent_len = X['x_sent_len'] + x_word_len = X['x_word_len'] + x_sent_masks = X['x_sent_masks'] + x_word_masks = X['x_word_masks'] + x_word_emb_packed = X['x_word_embeddings'] + char_emb_packed = self._word_net(x_words_chars, x_words_case, x_lang_word, x_word_masks, x_word_len) + gs_upos = None + if 'y_upos' in X: + gs_upos = X['y_upos'] + sl = x_sent_len.cpu().numpy() + + char_emb = unpack(char_emb_packed, sl, x_sents.shape[1], device=self._get_device()) + word_emb_ext = None + + for ii in range(len(x_word_emb_packed)): + we = unpack(x_word_emb_packed[ii], sl, x_sents.shape[1], self._get_device()) + if word_emb_ext is None: + word_emb_ext = self._ext_proj[ii](we) + else: + word_emb_ext = word_emb_ext + self._ext_proj[ii](we) + + word_emb_ext = word_emb_ext / len(x_word_emb_packed) + word_emb_ext = torch.tanh(word_emb_ext) + + lang_emb = self._lang_emb(x_lang_sent) + lang_emb = lang_emb.unsqueeze(1).repeat(1, char_emb.shape[1], 1) + + aupos = self._aupos(torch.cat([char_emb, lang_emb], dim=-1)) + axpos = self._axpos(torch.cat([char_emb, lang_emb], dim=-1)) + aattrs = self._aattrs(torch.cat([char_emb, lang_emb], dim=-1)) + + word_emb = self._word_emb(x_sents) + + x = mask_concat([word_emb, char_emb, word_emb_ext], 0.33, self.training, self._get_device()) + x = torch.cat([x, lang_emb], dim=-1) + x = x.permute(0, 2, 1) + lang_emb = lang_emb.permute(0, 2, 1) + half = self._config.cnn_filter // 2 + res = None + cnt = 0 + for conv in self._convs: + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = tmp + else: + res = res + tmp + x = torch.dropout(tmp, 0.2, self.training) + cnt += 1 + if cnt != self._config.cnn_layers: + x = torch.cat([x, lang_emb], dim=1) + x = x + res + xu = x.permute(0, 2, 1).contiguous() + x = torch.cat([x, lang_emb], dim=1) + x = x.permute(0, 2, 1) + # x = torch.tanh(x) + upos = self._upos(x) + if gs_upos is None: + upos_idx = torch.argmax(upos, dim=-1) + else: + upos_idx = gs_upos + upos_emb = self._upos_emb(upos_idx) + upos_emb = torch.cat([upos_emb, lang_emb.permute(0, 2, 1)], dim=-1) + xpos = self._xpos(torch.cat([upos_emb, xu], dim=-1)) + attrs = self._attrs(torch.cat([upos_emb, xu], dim=-1)) + return upos, xpos, attrs, aupos, axpos, aattrs + + def _get_device(self): + if self._lang_emb.weight.device.type == 'cpu': + return 'cpu' + return '{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index)) + + def configure_optimizers(self): + optimizer = 
torch.optim.AdamW(self.parameters())
+        return optimizer
+
+    def training_step(self, batch, batch_idx):
+        p_upos, p_xpos, p_attrs, a_upos, a_xpos, a_attrs = self.forward(batch)
+        y_upos = batch['y_upos']
+        y_xpos = batch['y_xpos']
+        y_attrs = batch['y_attrs']
+
+        loss_upos = F.cross_entropy(p_upos.view(-1, p_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_xpos = F.cross_entropy(p_xpos.reshape(-1, p_xpos.shape[2]), y_xpos.view(-1), ignore_index=0)
+        loss_attrs = F.cross_entropy(p_attrs.reshape(-1, p_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+
+        loss_aupos = F.cross_entropy(a_upos.view(-1, a_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_axpos = F.cross_entropy(a_xpos.view(-1, a_xpos.shape[2]), y_xpos.view(-1), ignore_index=0)
+        loss_aattrs = F.cross_entropy(a_attrs.view(-1, a_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+
+        step_loss = ((loss_upos + loss_attrs + loss_xpos) / 3.) * 1.0 + (
+                (loss_aupos + loss_aattrs + loss_axpos) / 3.) * 1.0
+
+        return {'loss': step_loss}
+
+    def validation_step(self, batch, batch_idx):
+        y_upos = batch['y_upos']
+        y_xpos = batch['y_xpos']
+        y_attrs = batch['y_attrs']
+        x_sent_len = batch['x_sent_len']
+        x_lang = batch['x_lang_sent']
+        del batch['y_upos']
+        p_upos, p_xpos, p_attrs, a_upos, a_xpos, a_attrs = self.forward(batch)
+
+        loss_upos = F.cross_entropy(p_upos.view(-1, p_upos.shape[2]), y_upos.view(-1), ignore_index=0)
+        loss_xpos = F.cross_entropy(p_xpos.reshape(-1, p_xpos.shape[2]), y_xpos.view(-1), ignore_index=0)
+        loss_attrs = F.cross_entropy(p_attrs.reshape(-1, p_attrs.shape[2]), y_attrs.view(-1), ignore_index=0)
+        loss = (loss_upos + loss_attrs + loss_xpos) / 3
+        language_result = {lang_id: {'total': 0, 'upos_ok': 0, 'xpos_ok': 0, 'attrs_ok': 0} for lang_id in
+                           range(self._num_langs)}
+
+        pred_upos = torch.argmax(p_upos, dim=-1).detach().cpu().numpy()
+        pred_xpos = torch.argmax(p_xpos, dim=-1).detach().cpu().numpy()
+        pred_attrs = torch.argmax(p_attrs, dim=-1).detach().cpu().numpy()
+        tar_upos = y_upos.detach().cpu().numpy()
+        tar_xpos = y_xpos.detach().cpu().numpy()
+        tar_attrs = y_attrs.detach().cpu().numpy()
+        sl = x_sent_len.detach().cpu().numpy()
+        x_lang = x_lang.detach().cpu().numpy()
+        for iSent in range(p_upos.shape[0]):
+            for iWord in range(sl[iSent]):
+                lang_id = x_lang[iSent] - 1
+                language_result[lang_id]['total'] += 1
+                if pred_upos[iSent, iWord] == tar_upos[iSent, iWord]:
+                    language_result[lang_id]['upos_ok'] += 1
+                if pred_xpos[iSent, iWord] == tar_xpos[iSent, iWord]:
+                    language_result[lang_id]['xpos_ok'] += 1
+                if pred_attrs[iSent, iWord] == tar_attrs[iSent, iWord]:
+                    language_result[lang_id]['attrs_ok'] += 1
+
+        return {'loss': loss, 'acc': language_result}
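Both the main and the auxiliary losses rely on index 0 being the padding id in every label vocabulary, so `ignore_index=0` masks padded word slots out of the average. A self-contained illustration (not part of the patch):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 4, 7)        # (batch, words, |UPOS|)
    gold = torch.tensor([[3, 1, 0, 0],   # trailing zeros are padding
                         [2, 5, 6, 0]])
    loss = F.cross_entropy(logits.view(-1, 7), gold.view(-1), ignore_index=0)
    print(loss.item())  # padded positions contribute nothing to the loss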
+
+    def validation_epoch_end(self, outputs):
+        language_result = {lang_index: {'total': 0, 'upos_ok': 0, 'xpos_ok': 0, 'attrs_ok': 0} for lang_index in
+                           range(self._num_langs)}
+
+        valid_loss_total = 0
+        total = 0
+        attrs_ok = 0
+        upos_ok = 0
+        xpos_ok = 0
+        for out in outputs:
+            # accumulate the validation loss once per batch
+            valid_loss_total += out['loss']
+            for lang_index in language_result:
+                language_result[lang_index]['total'] += out['acc'][lang_index]['total']
+                language_result[lang_index]['upos_ok'] += out['acc'][lang_index]['upos_ok']
+                language_result[lang_index]['xpos_ok'] += out['acc'][lang_index]['xpos_ok']
+                language_result[lang_index]['attrs_ok'] += out['acc'][lang_index]['attrs_ok']
+                # global
+                total += out['acc'][lang_index]['total']
+                upos_ok += out['acc'][lang_index]['upos_ok']
+                xpos_ok += out['acc'][lang_index]['xpos_ok']
+                attrs_ok += out['acc'][lang_index]['attrs_ok']
+
+        self.log('val/loss', valid_loss_total / len(outputs))
+        self.log('val/UPOS/total', upos_ok / total)
+        self.log('val/XPOS/total', xpos_ok / total)
+        self.log('val/ATTRS/total', attrs_ok / total)
+
+        res = {}
+        for lang_index in language_result:
+            total = language_result[lang_index]['total']
+            if total == 0:
+                total = 1
+            if self._language_codes is None:
+                lang = lang_index
+            else:
+                lang = self._language_codes[lang_index]
+            res[lang] = {
+                "upos": language_result[lang_index]['upos_ok'] / total,
+                "xpos": language_result[lang_index]['xpos_ok'] / total,
+                "attrs": language_result[lang_index]['attrs_ok'] / total
+            }
+
+            self.log('val/UPOS/{0}'.format(lang), language_result[lang_index]['upos_ok'] / total)
+            self.log('val/XPOS/{0}'.format(lang), language_result[lang_index]['xpos_ok'] / total)
+            self.log('val/ATTRS/{0}'.format(lang), language_result[lang_index]['attrs_ok'] / total)
+
+        # single value for early stopping
+        self._epoch_results = self._compute_early_stop(res)
+        self.log('val/early_meta', self._early_stop_meta_val)
+
+    def load(self, model_path: str, device: str = 'cpu'):
+        self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])
+        self.to(device)
+
+    def process(self, doc: Document, collate: MorphoCollate, upos: bool = True, xpos: bool = True, attrs: bool = True,
+                batch_size: int = 32, num_workers: int = 4) -> Document:
+        self.eval()
+        if not (upos or xpos or attrs):
+            raise Exception("To perform tagging at least one of 'upos', 'xpos' or 'attrs' must be set to True.")
+
+        dataset = MorphoDataset(doc)
+
+        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate.collate_fn,
+                                shuffle=False, num_workers=num_workers, pin_memory=True)
+        index = 0
+
+        with torch.no_grad():
+            for batch in dataloader:
+                del batch['y_upos']
+                p_upos, p_xpos, p_attrs, _, _, _ = self.forward(batch)
+
+                batch_size = p_upos.size()[0]
+
+                if upos:
+                    pred_upos = torch.argmax(p_upos, dim=-1).detach().cpu().numpy()
+                if xpos:
+                    pred_xpos = torch.argmax(p_xpos, dim=-1).detach().cpu().numpy()
+                if attrs:
+                    pred_attrs = torch.argmax(p_attrs, dim=-1).detach().cpu().numpy()
+
+                for sentence_index in range(batch_size):  # for each sentence
+                    for word_index in range(batch["x_sent_len"][sentence_index]):
+                        if upos:
+                            doc.sentences[index + sentence_index].words[word_index].upos = self._encodings.upos_list[
+                                pred_upos[sentence_index][word_index]]
+                        if xpos:
+                            doc.sentences[index + sentence_index].words[word_index].xpos = self._encodings.xpos_list[
+                                pred_xpos[sentence_index][word_index]]
+                        if attrs:
+                            doc.sentences[index + sentence_index].words[word_index].attrs = self._encodings.attrs_list[
+                                pred_attrs[sentence_index][word_index]]
+
+                index += batch_size
+        return doc
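Illustrative usage of `Tagger.process` (not part of the patch), assuming a loaded `Tagger` plus its `Encodings` and a `Document`, as in the demo at the bottom of this file:

    collate = MorphoCollate(encodings)
    doc = tagger.process(doc, collate, upos=True, xpos=False, attrs=False, batch_size=32)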
+
+
+class PrintAndSaveCallback(pl.callbacks.Callback):
+    def __init__(self, store_prefix):
+        super().__init__()
+        self.store_prefix = store_prefix
+
+    def on_validation_end(self, trainer, pl_module):
+        metrics = trainer.callback_metrics
+        epoch = trainer.current_epoch
+
+        for lang in pl_module._epoch_results:
+            res = pl_module._epoch_results[lang]
+            if "upos_best" in res:
+                trainer.save_checkpoint(self.store_prefix + "." + lang + ".upos")
+            if "xpos_best" in res:
+                trainer.save_checkpoint(self.store_prefix + "." + lang + ".xpos")
+            if "attrs_best" in res:
+                trainer.save_checkpoint(self.store_prefix + "." + lang + ".attrs")
+
+        trainer.save_checkpoint(self.store_prefix + ".last")
+
+        s = "{0:30s}\tUPOS\tXPOS\tATTRS".format("Language")
+        print("\n\n\t" + s)
+        print("\t" + ("=" * (len(s) + 9)))
+        for lang in pl_module._language_codes:
+            upos = metrics["val/UPOS/{0}".format(lang)]
+            xpos = metrics["val/XPOS/{0}".format(lang)]
+            attrs = metrics["val/ATTRS/{0}".format(lang)]
+            msg = "\t{0:30s}:\t{1:.4f}\t{2:.4f}\t{3:.4f}".format(lang, upos, xpos, attrs)
+            print(msg)
+        print("\n")
+
+
+if __name__ == '__main__':
+
+    root = "data/be"
+    language_code = "be_hse"
+    device = 'cpu'  # 'cuda'
+    batch_size = 2
+
+    # read yaml
+    object_config = yaml.full_load(open(root + ".yaml"))
+
+    # read model config
+    config = TaggerConfig(filename=root + ".config")
+
+    # read encodings
+    encodings = Encodings()
+    encodings.load(filename=root + ".encodings")
+
+    # load models
+    tagger_UPOS = Tagger.load_from_checkpoint(root + "." + language_code + ".upos", config=config, encodings=encodings,
+                                              language_codes=object_config["language_codes"])
+    tagger_UPOS.to(device)
+    tagger_UPOS.eval()
+    tagger_UPOS.freeze()
+
+    tagger_XPOS = Tagger.load_from_checkpoint(root + "." + language_code + ".xpos", config=config, encodings=encodings,
+                                              language_codes=object_config["language_codes"])
+    tagger_XPOS.to(device)
+    tagger_XPOS.eval()
+    tagger_XPOS.freeze()
+
+    tagger_ATTRS = Tagger.load_from_checkpoint(root + "." + language_code + ".attrs", config=config,
+                                               encodings=encodings,
+                                               language_codes=object_config["language_codes"])
+    tagger_ATTRS.to(device)
+    tagger_ATTRS.eval()
+    tagger_ATTRS.freeze()
+
+    # read a doc
+    doc = Document()
+    doc.load("corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu",
+             lang_id=object_config["language_codes"].index(language_code))
+
+    for si, _ in enumerate(doc.sentences):
+        for wi, _ in enumerate(doc.sentences[si].words):
+            doc.sentences[si].words[wi].upos = ""
+            doc.sentences[si].words[wi].xpos = ""
+            doc.sentences[si].words[wi].attrs = ""
+
+    print(doc)
+
+    # process() expects the collate object as its second argument
+    collate = MorphoCollate(encodings)
+    doc = tagger_UPOS.process(doc, collate, upos=True, xpos=False, attrs=False, batch_size=batch_size)
+    doc = tagger_XPOS.process(doc, collate, upos=False, xpos=True, attrs=False, batch_size=batch_size)
+    doc = tagger_ATTRS.process(doc, collate, upos=False, xpos=False, attrs=True, batch_size=batch_size)
+
+    print(doc)
diff --git a/cube/networks/tokenizer.py b/cube/networks/tokenizer.py
new file mode 100644
index 000000000..7398bc2dd
--- /dev/null
+++ b/cube/networks/tokenizer.py
@@ -0,0 +1,467 @@
+import sys
+
+from cube.networks.utils import unpack, mask_concat
+
+sys.path.append('')
+import os, argparse
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import pytorch_lightning as pl
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from cube.io_utils.objects import Document, Sentence, Token, Word
+from cube.io_utils.encodings import Encodings
+from cube.io_utils.config import TokenizerConfig
+from cube.networks.utils_tokenizer import TokenCollate
+import numpy as np
+from cube.networks.modules import ConvNorm, LinearNorm, MLP
+from torch.utils.data import DataLoader
+import random
+
+from cube.networks.modules import WordGram
+
+
+class Tokenizer(pl.LightningModule):
+    def __init__(self, config: TokenizerConfig, encodings: Encodings, language_codes: [] = None, ext_word_emb=0,
+                 max_seq_len=-1):
+        super().__init__()
+        self._language_codes = language_codes
+        self._config = config
+        self._max_seq_len = max_seq_len
+        if not isinstance(ext_word_emb,
list): + ext_word_emb = [ext_word_emb] + self._ext_word_emb = ext_word_emb + conv_layers = [] + cs_inp = config.external_proj_size + config.lang_emb_size + 256 + 16 + NUM_FILTERS = config.cnn_filter + for _ in range(config.cnn_layers): + conv_layer = nn.Sequential( + ConvNorm(cs_inp, + NUM_FILTERS, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(NUM_FILTERS)) + conv_layers.append(conv_layer) + cs_inp = NUM_FILTERS // 2 + config.lang_emb_size + self._convs = nn.ModuleList(conv_layers) + self._wg = WordGram(len(encodings.char2int), num_langs=encodings.num_langs) + self._lang_emb = nn.Embedding(encodings.num_langs + 1, config.lang_emb_size, padding_idx=0) + self._spa_emb = nn.Embedding(3, 16, padding_idx=0) + self._output = LinearNorm(NUM_FILTERS // 2 + config.lang_emb_size, 5) + + ext2int = [] + for input_size in self._ext_word_emb: + module = MLP(input_size, config.external_proj_size) + ext2int.append(module) + self._ext_proj = nn.ModuleList(ext2int) + + if self._language_codes: # only for training + self._dev_results = {i: [] for i, _ in enumerate(self._language_codes)} + + self._res = {} + for language_code in self._language_codes: + self._res[language_code] = {"sent": 0., "token": 0.} + self._early_stop_meta_val = 0 + self._epoch_results = {} + + def forward(self, batch): + x_emb = batch['x_input'] + x_spa = batch['x_input_spa'] + x_lang = batch['x_lang'] + x_lang = self._lang_emb(x_lang).unsqueeze(1).repeat(1, x_emb[0].shape[1], 1) + x_word_char = batch['x_word_char'] + x_word_case = batch['x_word_case'] + x_word_lang = batch['x_word_lang'] + x_word_masks = batch['x_word_masks'] + x_word_len = batch['x_word_len'] + x_sent_len = batch['x_sent_len'] + char_emb_packed = self._wg(x_word_char, x_word_case, x_word_lang, x_word_masks, x_word_len) + + sl = x_sent_len.cpu().numpy() + + x_char_emb = unpack(char_emb_packed, sl, x_emb[0].shape[1], device=self._get_device()) + + word_emb_ext = None + + for ii in range(len(x_emb)): + we = x_emb[ii] + if word_emb_ext is None: + word_emb_ext = self._ext_proj[ii](we.float().to(self._get_device())) + else: + word_emb_ext = word_emb_ext + self._ext_proj[ii](we) + + word_emb_ext = word_emb_ext / len(x_emb) + word_emb_ext = torch.tanh(word_emb_ext) + x_emb = word_emb_ext + x_spa_emb = self._spa_emb(x_spa) + x_emb = mask_concat([x_emb, x_char_emb], 0.33, self.training, self._get_device()) + x_emb = torch.cat([x_emb, x_spa_emb], dim=-1) + x = torch.cat([x_emb, x_lang], dim=-1).permute(0, 2, 1) + x_lang = x_lang.permute(0, 2, 1) + half = self._config.cnn_filter // 2 + res = None + cnt = 0 + for conv in self._convs: + conv_out = conv(x) + tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :])) + if res is None: + res = tmp + else: + res = res + tmp + x = torch.dropout(tmp, 0.2, self.training) + cnt += 1 + if cnt != self._config.cnn_layers: + x = torch.cat([x, x_lang], dim=1) + x = x + res + x = torch.cat([x, x_lang], dim=1) + x = x.permute(0, 2, 1) + return self._output(x) + + def validation_step(self, batch, batch_idx): + if batch['x_input'] is None: + print("Return 0") + return None + x_lang = batch['x_lang'] + x_text = batch['x_text'] + y_offset = batch['y_offset'].cpu().numpy() + y_target = batch['y_output'].cpu().numpy() + y_len = batch['y_len'].cpu().numpy() + x_l = x_lang.cpu().numpy() + y_pred = self.forward(batch) + y_pred = torch.argmax(y_pred, dim=-1).detach().cpu().numpy() + for ii in range(len(y_len)): + ofs = y_offset[ii] + lang = x_l[ii] - 1 + for jj in range(y_len[ii]): + 
self._dev_results[lang].append([x_text[ii][jj], y_target[ii, jj + ofs], y_pred[ii, jj + ofs]])
+
+    def validation_epoch_end(self, outputs) -> None:
+        results = {}
+
+        for lang in self._dev_results:
+            data = self._dev_results[lang]
+            g_sents = []
+            p_sents = []
+            tok_p = ''
+            tok_g = ''
+            g_sent = []
+            p_sent = []
+            for example in data:
+                target = example[1]
+                pred = example[2]
+                text = example[0].replace('▁', '')
+                tok_g += text
+                tok_p += text
+                if target == 2 or target == 3 or target == 4:
+                    if tok_g.strip() != '':
+                        g_sent.append(tok_g)
+                    tok_g = ''
+                    if target == 4:
+                        if len(g_sent) != 0:
+                            g_sents.append(g_sent)
+                        g_sent = []
+
+                if pred == 2 or pred == 3 or pred == 4:
+                    if tok_p.strip() != '':
+                        p_sent.append(tok_p)
+                    tok_p = ''
+                    if pred == 4:
+                        if len(p_sent) != 0:
+                            p_sents.append(p_sent)
+                        p_sent = []
+
+            if tok_g.strip() != '':
+                g_sent.append(tok_g)
+            if len(g_sent) != 0:
+                g_sents.append(g_sent)
+            if tok_p.strip() != '':
+                p_sent.append(tok_p)
+            if len(p_sent) != 0:
+                p_sents.append(p_sent)
+
+            sent_f, tok_f = _conll_eval(g_sents, p_sents)
+
+            if self._language_codes is not None:
+                lang = self._language_codes[lang]
+            results[lang] = {}
+            results[lang]['sent'] = sent_f
+            results[lang]['token'] = tok_f
+            self.log('val/SENT/{0}'.format(lang), sent_f)
+            self.log('val/TOKEN/{0}'.format(lang), tok_f)
+
+        self._dev_results = {i: [] for i, _ in enumerate(self._language_codes)}
+        self._epoch_results = self._compute_early_stop(results)
+        self.log('val/early_meta', self._early_stop_meta_val)
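The per-piece labels drive both this evaluation loop and `process()` below: labels 2/3/4 close the current token (3 additionally flags a multiword token, 4 also closes the sentence), and any other label continues the token. A compact stand-alone illustration of the convention (not part of the patch):

    pieces = ['He', 'll', 'o', ' wor', 'ld', ' .']
    preds = [1, 1, 2, 1, 2, 4]
    sents, sent, tok = [], [], ''
    for p, text in zip(preds, pieces):
        tok += text
        if p in (2, 3, 4):        # this piece ends a token
            if tok.strip():
                sent.append(tok.strip())
            tok = ''
            if p == 4 and sent:   # ...and also ends the sentence
                sents.append(sent)
                sent = []
    print(sents)  # [['Hello', 'world', '.']]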
+
+    def training_step(self, batch, batch_idx):
+        if batch['x_input'] is None:
+            print("Return 0")
+            return None
+
+        y_target = batch['y_output']
+        if self._max_seq_len != -1 and y_target.shape[1] > self._max_seq_len:  # fix for HF
+            return None
+        y_pred = self.forward(batch)
+
+        loss = F.cross_entropy(y_pred.view(-1, y_pred.shape[2]), y_target.view(-1), ignore_index=0)
+        return loss
+
+    def load(self, model_path: str, device: str = 'cpu'):
+        self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])
+        self.to(device)
+
+    def process(self, raw_text, collate: TokenCollate, batch_size=32, num_workers: int = 4, lang_id: int = 0):
+        raw_text = raw_text.replace('\n', ' ').replace('\r', ' ')
+        # collapse runs of consecutive spaces into a single space
+        new_text = raw_text.replace('  ', ' ')
+        while new_text != raw_text:
+            raw_text = new_text
+            new_text = raw_text.replace('  ', ' ')
+
+        self.eval()
+        from cube.networks.utils_tokenizer import TokenDatasetLive
+        dataset = TokenDatasetLive(raw_text, collate.get_tokens)
+        collate._lang_id = lang_id
+        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate.collate_fn,
+                                shuffle=False, num_workers=num_workers)
+
+        toks = []
+        preds = []
+        for batch in dataloader:
+            for key in batch:
+                if isinstance(batch[key], torch.Tensor):
+                    batch[key] = batch[key].to(self._device)
+
+            x_text = batch['x_text']
+            y_offset = batch['y_offset'].cpu().numpy()
+            y_len = batch['y_len'].cpu().numpy()
+            with torch.no_grad():
+                y_pred = self.forward(batch)
+            y_pred = torch.argmax(y_pred, dim=-1).detach().cpu().numpy()
+            for ii in range(len(y_len)):
+                ofs = y_offset[ii]
+                for jj in range(y_len[ii]):
+                    toks.append(x_text[ii][jj])
+                    preds.append(y_pred[ii, jj + ofs])
+
+        p_sents = []
+        tok_p = ''
+        p_mwes = []
+        p_sent = []
+        p_mwe = []
+        for pred, text in zip(preds, toks):
+            text = text.replace('▁', '')
+            tok_p += text
+
+            if pred == 2 or pred == 3 or pred == 4:
+                if tok_p.strip() != '':
+                    p_sent.append(tok_p)
+                    if pred == 3:
+                        p_mwe.append(True)
+                    else:
+                        p_mwe.append(False)
+                tok_p = ''
+                if pred == 4:
+                    if len(p_sent) != 0:
+                        p_sents.append(p_sent)
+                        p_mwes.append(p_mwe)
+                    p_sent = []
+                    p_mwe = []
+
+        if tok_p.strip() != '':
+            p_sent.append(tok_p)
+            p_mwe.append(False)
+        if len(p_sent) != 0:
+            p_sents.append(p_sent)
+            p_mwes.append(p_mwe)
+
+        d = Document()
+
+        for sent, mwe in zip(p_sents, p_mwes):
+            seq = []
+            cnt = 0
+            spaceafter = "_"
+            for w, m in zip(sent, mwe):
+                cnt += 1
+
+                seq.append(Word(cnt, w, '_', '_', '_', '_', 0, '_', '_', spaceafter))
+                if m:
+                    seq[-1].space_after += ';compound'
+            s = Sentence(sequence=seq, lang_id=lang_id)
+
+            d.sentences.append(s)
+        return d
+
+    def configure_optimizers(self):
+        return torch.optim.AdamW(self.parameters())
+
+    def _compute_early_stop(self, res):
+        for lang in res:
+            if res[lang]["sent"] > self._res[lang]["sent"]:
+                self._early_stop_meta_val += 1
+                self._res[lang]["sent"] = res[lang]["sent"]
+                res[lang]["sent_best"] = True
+            if res[lang]["token"] > self._res[lang]["token"]:
+                self._early_stop_meta_val += 1
+                self._res[lang]["token"] = res[lang]["token"]
+                res[lang]["token_best"] = True
+        return res
+
+    def _get_device(self):
+        if self._lang_emb.weight.device.type == 'cpu':
+            return 'cpu'
+        return '{0}:{1}'.format(self._lang_emb.weight.device.type, str(self._lang_emb.weight.device.index))
+
+
+def _detect_no_space_lang(document: Document):
+    seen_spc = 0
+    POLL_RANGE = 50
+    for ii in range(POLL_RANGE):
+        index = random.randint(0, len(document.sentences) - 1)
+        text = document.sentences[index].text.strip()
+        if ' ' in text:
+            seen_spc += 1
+    if seen_spc / POLL_RANGE > 0.5:
+        return False
+    else:
+        return True
+
+
+class PrintAndSaveCallback(pl.callbacks.Callback):
+    def __init__(self, store_prefix):
+        super().__init__()
+        self.store_prefix = store_prefix
+
+    def on_validation_end(self, trainer, pl_module):
+        metrics = trainer.callback_metrics
+        epoch = trainer.current_epoch
+
+        for lang in pl_module._epoch_results:
+            res = pl_module._epoch_results[lang]
+            if "sent_best" in res:
+                trainer.save_checkpoint(self.store_prefix + "." + lang + ".sent")
+            if "token_best" in res:
+                trainer.save_checkpoint(self.store_prefix + "."
+ lang + ".tok") + + trainer.save_checkpoint(self.store_prefix + ".last") + + s = "{0:30s}\tSENT\tTOKEN".format("Language") + print("\n\n\t" + s) + print("\t" + ("=" * (len(s) + 9))) + for lang in pl_module._language_codes: + sent = metrics["val/SENT/{0}".format(lang)] + token = metrics["val/TOKEN/{0}".format(lang)] + msg = "\t{0:30s}:\t{1:.4f}\t{2:.4f}".format(lang, sent, token) + print(msg) + print("\n") + + +def _conll_eval(gold, pred): + f = open('tmp_g.txt', 'w') + for sent in gold: + for ii in range(len(sent)): + head = ii + f.write('{0}\t{1}\t_\t_\t_\t_\t{2}\t_\t_\t_\n'.format(ii + 1, sent[ii], head)) + f.write('\n') + f.close() + + f = open('tmp_p.txt', 'w') + for sent in pred: + for ii in range(len(sent)): + head = ii + f.write('{0}\t{1}\t_\t_\t_\t_\t{2}\t_\t_\t_\n'.format(ii + 1, sent[ii], head)) + f.write('\n') + f.close() + from _cube.misc.conll18_ud_eval_wrapper import conll_eval + result = conll_eval('tmp_g.txt', 'tmp_p.txt') + if result is None: + return 0, 0 + else: + return result['Sentences'].f1, result['Tokens'].f1 + + +""" +if __name__ == '__main__': + from cube.io_utils.misc import ArgParser + + argparser = ArgParser() + # run argparser + args = argparser() + print(args) # example + + import json + + langs = json.load(open(args.train_file)) + doc_train = Document() + doc_dev = Document() + id2lang = {} + for ii in range(len(langs)): + lang = langs[ii] + print(lang[1], ii) + doc_train.load(lang[1], lang_id=ii) + doc_dev.load(lang[2], lang_id=ii) + id2lang[ii] = lang[0] + + # ensure target dir exists + target = args.store + i = args.store.rfind("/") + if i > 0: + target = args.store[:i] + os.makedirs(target, exist_ok=True) + + enc = Encodings() + enc.compute(doc_train, None) + enc.save('{0}.encodings'.format(args.store)) + + config = TokenizerConfig() + no_space_lang = _detect_no_space_lang(doc_train) + print("NO_SPACE_LANG = " + str(no_space_lang)) + config.no_space_lang = no_space_lang + config.lm_model = args.lm_model + if args.config_file: + config.load(args.config_file) + if args.lm_model is not None: + config.lm_model = args.lm_model + config.save('{0}.config'.format(args.store)) + + # helper = LMHelper(device=args.lm_device, model=config.lm_model) + # helper.apply(doc_dev) + # helper.apply(doc_train) + trainset = TokenizationDataset(doc_train) + devset = TokenizationDataset(doc_dev, shuffle=False) + collate = TokenCollate(enc, lm_device=args.lm_device, lm_model=args.lm_model, no_space_lang=config.no_space_lang) + train_loader = DataLoader(trainset, batch_size=args.batch_size, collate_fn=collate.collate_fn, shuffle=True, + num_workers=args.num_workers) + val_loader = DataLoader(devset, batch_size=args.batch_size, collate_fn=collate.collate_fn, + num_workers=args.num_workers) + + model = Tokenizer(config=config, encodings=enc, id2lang=id2lang) + + # training + + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + if args.gpus == 0: + acc = 'ddp_cpu' + else: + acc = 'ddp' + trainer = pl.Trainer( + gpus=args.gpus, + accelerator=acc, + num_nodes=1, + default_root_dir='data/', + callbacks=[early_stopping_callback, PrintAndSaveCallback(args, id2lang)], + # limit_train_batches=5, + # limit_val_batches=2, + ) + + trainer.fit(model, train_loader, val_loader) +""" diff --git a/cube/networks/utils.py b/cube/networks/utils.py new file mode 100644 index 000000000..290d7e1b1 --- /dev/null +++ b/cube/networks/utils.py @@ -0,0 +1,575 @@ +import sys +import random +from abc import abstractmethod + 
+sys.path.append('') +import numpy as np +import torch +from torch.utils.data.dataset import Dataset + +from cube.io_utils.objects import Document, Sentence, Token, Word +from cube.io_utils.encodings import Encodings + +from collections import namedtuple +from transformers import AutoModel, AutoTokenizer + + +def unpack(data: torch.Tensor, sizes: [], max_size: int, device: str): + pos = 0 + blist = [] + for ii in range(len(sizes)): + slist = [] + for jj in range(sizes[ii]): + slist.append(data[pos, :].unsqueeze(0).to(device)) + pos += 1 + + for jj in range(max_size - sizes[ii]): + slist.append(torch.zeros((1, data.shape[-1]), device=device, dtype=torch.float)) + slist = torch.cat(slist, dim=0) + blist.append(slist.unsqueeze(0)) + + blist = torch.cat(blist, dim=0) + return blist + + +def mask_concat(representations, drop_prob: float, training: bool, device: str): + if training: + masks = [] + for ii in range(len(representations)): + mask = np.ones((representations[ii].shape[0], representations[ii].shape[1]), dtype=np.long) + masks.append(mask) + + for ii in range(masks[0].shape[0]): + for jj in range(masks[0].shape[1]): + mult = 1 + for kk in range(len(masks)): + p = random.random() + if p < drop_prob: + mult += 1 + masks[kk][ii, jj] = 0 + for kk in range(len(masks)): + masks[kk][ii, jj] *= mult + for kk in range(len(masks)): + masks[kk] = torch.tensor(masks[kk], device=device) + + for kk in range(len(masks)): + representations[kk] = representations[kk] * masks[kk].unsqueeze(2) + + return torch.cat(representations, dim=-1) + + +class TokenizationDataset(Dataset): + def __init__(self, document: Document, shuffle=True): + self._document = document + self._shuffle = shuffle + + def __len__(self): + return len(self._document.sentences) + + def __getitem__(self, item): + # append two random sentences + if self._shuffle: + index1 = random.randint(0, len(self._document.sentences) - 1) + index2 = random.randint(0, len(self._document.sentences) - 1) + else: + index1 = item - 1 + index2 = item + 1 + if index1 >= 0: + prev = self._document.sentences[index1] + else: + prev = Sentence(sequence=[]) + if index2 < len(self._document.sentences): + next = self._document.sentences[index2] + else: + next = Sentence(sequence=[]) + return {'main': self._document.sentences[item], 'prev': prev, 'next': next} + + +class MorphoDataset(Dataset): + def __init__(self, document: Document): + self._document = document + + def __len__(self): + return len(self._document.sentences) + + def __getitem__(self, item): + return self._document.sentences[item] + + +class LemmaDataset(Dataset): + def __init__(self, document: Document, for_training=True): + self._examples = [] + lookup = {} + for sent in document.sentences: + lang_id = sent.lang_id + for w in sent.words: + word = w.word + lemma = w.lemma + upos = w.upos + + key = (word, lang_id, upos) + if key not in lookup or for_training is False: + lookup[key] = 1 + example = {'word': word, 'upos': upos, 'lang_id': lang_id, 'target': lemma} + self._examples.append(example) + + def __len__(self): + return len(self._examples) + + def __getitem__(self, item): + return self._examples[item] + + +class CompoundDataset(Dataset): + def __init__(self, document: Document, for_training=True): + self._examples = [] + lookup = {} + for sent in document.sentences: + lang_id = sent.lang_id + if for_training is True: + for t in sent.tokens: + if len(t.words) > 1: + word = t.text + target = ' '.join([w.word for w in t.words]) + key = (word, lang_id) + # if key not in lookup: + lookup[key] = 1 + 
example = {'word': word, 'lang_id': lang_id, 'target': target} + self._examples.append(example) + else: + for t in sent.tokens: + example = {'word': t.words[0].word, 'lang_id': lang_id, 'target': ""} + self._examples.append(example) + + def __len__(self): + return len(self._examples) + + def __getitem__(self, item): + return self._examples[item] + + +class Word2TargetCollate: + def __init__(self, encodings: Encodings): + self._encodings = encodings + self._start_index = len(encodings.char2int) + self._stop_index = len(encodings.char2int) + 1 + self._eol = len(encodings.char2int) + + def collate_fn(self, batch): + max_input_len = max([len(e['word']) for e in batch]) + max_target_len = max([len(e['target']) for e in batch]) + n = len(batch) + x_char = np.zeros((n, max_input_len + 2), dtype=np.long) + x_case = np.zeros((n, max_input_len + 2), dtype=np.long) + y_char = np.zeros((n, max_target_len + 1), dtype=np.long) + y_case = np.zeros((n, max_target_len + 1), dtype=np.long) + x_lang = np.zeros(n, dtype=np.long) + x_upos = np.zeros(n, dtype=np.long) + + for ii in range(n): + word = batch[ii]['word'] + target = batch[ii]['target'] + lang_id = batch[ii]['lang_id'] + upos = '' + if 'upos' in batch[ii]: + upos = batch[ii]['upos'] + x_lang[ii] = lang_id + 1 + if upos in self._encodings.upos2int: + x_upos[ii] = self._encodings.upos2int[upos] + else: + x_upos[ii] = self._encodings.upos2int[''] + + i_char = 1 + x_char[ii, 0] = self._start_index + for ch in word: + if ch.lower() == ch.upper(): + x_case[ii, i_char] = 1 # symbol + elif ch.lower() != ch: + x_case[ii, i_char] = 2 # uppercase + else: + x_case[ii, i_char] = 3 # lowercase + if ch.lower() in self._encodings.char2int: + x_char[ii, i_char] = self._encodings.char2int[ch.lower()] + else: + x_char[ii, i_char] = self._encodings.char2int[''] + i_char += 1 + x_char[ii, i_char] = self._stop_index + + i_char = 0 + for ch in target: + if ch.lower() == ch.upper(): + y_case[ii, i_char] = 1 # symbol + elif ch.lower() != ch: + y_case[ii, i_char] = 2 # uppercase + else: + y_case[ii, i_char] = 3 # lowercase + if ch.lower() in self._encodings.char2int: + y_char[ii, i_char] = self._encodings.char2int[ch.lower()] + else: + y_char[ii, i_char] = self._encodings.char2int[''] + i_char += 1 + y_char[ii, i_char] = self._eol + + rez = { + 'x_char': torch.tensor(x_char), + 'x_case': torch.tensor(x_case), + 'x_lang': torch.tensor(x_lang), + 'x_upos': torch.tensor(x_upos), + 'y_char': torch.tensor(y_char), + 'y_case': torch.tensor(y_case) + } + return rez + + +class MorphoCollate: + def __init__(self, encodings: Encodings, add_parsing=False, rhl_win_size=7): + self._encodings = encodings + self._add_parsing = add_parsing + self._rhl_win_size = rhl_win_size + + def collate_fn(self, batch: [Sentence]): + a_sent_len = [len(sent.words) for sent in batch] + a_word_len = [] + + x_word_embeddings = [[] for _ in range(len(batch[0].words[0].emb))] + for sent in batch: + for word in sent.words: + a_word_len.append(len(word.word)) + for ii in range(len(word.emb)): + x_word_embeddings[ii].append(word.emb[ii]) + x_sent_len = np.array(a_sent_len, dtype=np.long) + x_word_len = np.array(a_word_len, dtype=np.long) + max_sent_len = np.max(x_sent_len) + max_word_len = np.max(x_word_len) + x_sent_masks = np.zeros((len(batch), max_sent_len), dtype=np.float) + x_word_masks = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.float) + + x_sent = np.zeros((len(batch), max_sent_len), dtype=np.long) + x_word = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + x_word_case = 
np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + c_word = 0 + x_lang_sent = np.zeros((len(batch)), dtype=np.long) + x_lang_word = [] + + y_upos = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + y_xpos = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + y_attrs = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + + y_head = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + y_label = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + y_rhl = np.zeros((x_sent.shape[0], x_sent.shape[1]), dtype=np.long) + + for iSent in range(len(batch)): + sent = batch[iSent] + x_lang_sent[iSent] = sent.lang_id + 1 + for iWord in range(len(sent.words)): + word = sent.words[iWord] + y_head[iSent, iWord] = word.head + rhl = word.head - iWord + self._rhl_win_size + rhl = np.clip(rhl, 0, self._rhl_win_size * 2 - 1) + y_rhl[iSent, iWord] = rhl + + if word.label in self._encodings.label2int: + y_label[iSent, iWord] = self._encodings.label2int[word.label] + if word.upos in self._encodings.upos2int: + y_upos[iSent, iWord] = self._encodings.upos2int[word.upos] + if word.xpos in self._encodings.xpos2int: + y_xpos[iSent, iWord] = self._encodings.xpos2int[word.xpos] + if word.attrs in self._encodings.attrs2int: + y_attrs[iSent, iWord] = self._encodings.attrs2int[word.attrs] + word = sent.words[iWord].word + x_sent_masks[iSent, iWord] = 1 + w = word.lower() + x_lang_word.append(sent.lang_id + 1) + if w in self._encodings.word2int: + x_sent[iSent, iWord] = self._encodings.word2int[w] + else: + x_sent[iSent, iWord] = self._encodings.word2int[''] + + for iChar in range(len(word)): + x_word_masks[c_word, iChar] = 1 + ch = word[iChar] + if ch.lower() == ch.upper(): # symbol + x_word_case[c_word, iChar] = 1 + elif ch.lower() != ch: # upper + x_word_case[c_word, iChar] = 2 + else: # lower + x_word_case[c_word, iChar] = 3 + ch = ch.lower() + if ch in self._encodings.char2int: + x_word[c_word, iChar] = self._encodings.char2int[ch] + else: + x_word[c_word, iChar] = self._encodings.char2int[''] + c_word += 1 + + x_lang_word = np.array(x_lang_word) + x_word_embeddings = [torch.tensor(t) for t in x_word_embeddings] + response = { + 'x_sent': torch.tensor(x_sent), + 'x_lang_sent': torch.tensor(x_lang_sent), + 'x_word': torch.tensor(x_word), + 'x_word_case': torch.tensor(x_word_case), + 'x_lang_word': torch.tensor(x_lang_word), + 'x_sent_len': torch.tensor(x_sent_len), + 'x_word_len': torch.tensor(x_word_len), + 'x_sent_masks': torch.tensor(x_sent_masks), + 'x_word_masks': torch.tensor(x_word_masks), + 'x_word_embeddings': x_word_embeddings, + 'y_upos': torch.tensor(y_upos), + 'y_xpos': torch.tensor(y_xpos), + 'y_attrs': torch.tensor(y_attrs) + } + + if self._add_parsing: + response['y_head'] = torch.tensor(y_head) + response['y_label'] = torch.tensor(y_label) + response['y_rhl'] = torch.tensor(y_rhl) + + return response + + +Arc = namedtuple('Arc', ('tail', 'weight', 'head')) + + +class GreedyDecoder: + def _valid(self, arc, tree): + # just one head + for sa in tree: + if sa.tail == arc.tail: + return False + stack = [arc.head] + pos = 0 + used = [False] * len(tree) + while pos < len(stack): + for zz in range(len(tree)): + if tree[zz].tail == stack[pos] and not used[zz]: + used[zz] = True + stack.append(tree[zz].head) + if tree[zz].head == arc.tail: + return False + pos += 1 + # print pos,len(stack) + return True + + def _get_sort_key(self, item): + return item.weight + + def _greedy_tree(self, arcs): + arcs = sorted(arcs, key=self._get_sort_key, reverse=True) + # 
print arcs + final_tree = [] + for index in range(len(arcs)): + if self._valid(arcs[index], final_tree): + final_tree.append(arcs[index]) + # print arcs[index] + return final_tree + + def _make_ordered_list(self, tree, nWords): + lst = [0] * nWords # np.zeros(nWords) + for arc in tree: + # arc = tree[index] + tail = arc.tail + head = arc.head + lst[tail] = head + return lst[1:] + + def decode(self, score, lens): + best_tree_list = [] + for ii in range(score.shape[0]): + # norm_score = score[ii, :lens[ii], :lens[ii]] + norm_score = np.zeros((lens[ii] + 1, lens[ii] + 1)) + for wii in range(lens[ii]): + for wjj in range(lens[ii] + 1): + norm_score[wii + 1, wjj] = score[ii, wii, wjj] + nWords = norm_score.shape[0] # len(norm_score) + g = [] + for iSrc in range(1, nWords): + for iDst in range(1, nWords): + if iDst != iSrc: + a = Arc(iSrc, norm_score[iSrc][iDst], iDst) + g.append(a) + tree = self._greedy_tree(g) + best_tree = self._make_ordered_list(tree, nWords) + best_tree_list.append(best_tree) + return best_tree_list + + +# Code adapted from https://github.com/tdozat/Parser-v3/blob/master/scripts/chuliu_edmonds.py +class ChuLiuEdmondsDecoder: + def __init__(self): + pass + + def _tarjan(self, tree): + indices = -np.ones_like(tree) + lowlinks = -np.ones_like(tree) + onstack = np.zeros_like(tree, dtype=bool) + stack = list() + _index = [0] + cycles = [] + + def strong_connect(i): + _index[0] += 1 + index = _index[-1] + indices[i] = lowlinks[i] = index - 1 + stack.append(i) + onstack[i] = True + dependents = np.where(np.equal(tree, i))[0] + for j in dependents: + if indices[j] == -1: + strong_connect(j) + lowlinks[i] = min(lowlinks[i], lowlinks[j]) + elif onstack[j]: + lowlinks[i] = min(lowlinks[i], indices[j]) + + # There's a cycle! + if lowlinks[i] == indices[i]: + cycle = np.zeros_like(indices, dtype=bool) + while stack[-1] != i: + j = stack.pop() + onstack[j] = False + cycle[j] = True + stack.pop() + onstack[i] = False + cycle[i] = True + if cycle.sum() > 1: + cycles.append(cycle) + return + + for i in range(len(tree)): + if indices[i] == -1: + strong_connect(i) + return cycles + + def _chuliu_edmonds(self, scores): + scores *= (1 - np.eye(scores.shape[0])) + scores[0] = 0 + scores[0, 0] = 1 + tree = np.argmax(scores, axis=1) + cycles = self._tarjan(tree) + if not cycles: + return tree + else: + # t = len(tree); c = len(cycle); n = len(noncycle) + # locations of cycle; (t) in [0,1] + cycle = cycles.pop() + # indices of cycle in original tree; (c) in t + cycle_locs = np.where(cycle)[0] + # heads of cycle in original tree; (c) in t + cycle_subtree = tree[cycle] + # scores of cycle in original tree; (c) in R + cycle_scores = scores[cycle, cycle_subtree] + # total score of cycle; () in R + cycle_score = cycle_scores.prod() + + # locations of noncycle; (t) in [0,1] + noncycle = np.logical_not(cycle) + # indices of noncycle in original tree; (n) in t + noncycle_locs = np.where(noncycle)[0] + # print(cycle_locs, noncycle_locs) + + # scores of cycle's potential heads; (c x n) - (c) + () -> (n x c) in R + metanode_head_scores = scores[cycle][:, noncycle] / cycle_scores[:, None] * cycle_score + # scores of cycle's potential dependents; (n x c) in R + metanode_dep_scores = scores[noncycle][:, cycle] + # best noncycle head for each cycle dependent; (n) in c + metanode_heads = np.argmax(metanode_head_scores, axis=0) + # best cycle head for each noncycle dependent; (n) in c + metanode_deps = np.argmax(metanode_dep_scores, axis=1) + + # scores of noncycle graph; (n x n) in R + subscores = 
scores[noncycle][:, noncycle] + # pad to contracted graph; (n+1 x n+1) in R + subscores = np.pad(subscores, ((0, 1), (0, 1)), 'constant') + # set the contracted graph scores of cycle's potential heads; (c x n)[:, (n) in n] in R -> (n) in R + subscores[-1, :-1] = metanode_head_scores[metanode_heads, np.arange(len(noncycle_locs))] + # set the contracted graph scores of cycle's potential dependents; (n x c)[(n) in n] in R-> (n) in R + subscores[:-1, -1] = metanode_dep_scores[np.arange(len(noncycle_locs)), metanode_deps] + + # MST with contraction; (n+1) in n+1 + contracted_tree = self._chuliu_edmonds(subscores) + # head of the cycle; () in n + # print(contracted_tree) + cycle_head = contracted_tree[-1] + # fixed tree: (n) in n+1 + contracted_tree = contracted_tree[:-1] + # initialize new tree; (t) in 0 + new_tree = -np.ones_like(tree) + # print(0, new_tree) + # fixed tree with no heads coming from the cycle: (n) in [0,1] + contracted_subtree = contracted_tree < len(contracted_tree) + # add the nodes to the new tree (t)[(n)[(n) in [0,1]] in t] in t = (n)[(n)[(n) in [0,1]] in n] in t + new_tree[noncycle_locs[contracted_subtree]] = noncycle_locs[contracted_tree[contracted_subtree]] + # print(1, new_tree) + # fixed tree with heads coming from the cycle: (n) in [0,1] + contracted_subtree = np.logical_not(contracted_subtree) + # add the nodes to the tree (t)[(n)[(n) in [0,1]] in t] in t = (c)[(n)[(n) in [0,1]] in c] in t + new_tree[noncycle_locs[contracted_subtree]] = cycle_locs[metanode_deps[contracted_subtree]] + # print(2, new_tree) + # add the old cycle to the tree; (t)[(c) in t] in t = (t)[(c) in t] in t + new_tree[cycle_locs] = tree[cycle_locs] + # print(3, new_tree) + # root of the cycle; (n)[() in n] in c = () in c + cycle_root = metanode_heads[cycle_head] + # add the root of the cycle to the new tree; (t)[(c)[() in c] in t] = (c)[() in c] + new_tree[cycle_locs[cycle_root]] = noncycle_locs[cycle_head] + # print(4, new_tree) + return new_tree + + def _chuliu_edmonds_one_root(self, scores): + """""" + + scores = scores.astype(np.float64) + tree = self._chuliu_edmonds(scores) + roots_to_try = np.where(np.equal(tree[1:], 0))[0] + 1 + if len(roots_to_try) == 1: + return tree + + # Look at all roots that are more likely than we would expect + if len(roots_to_try) == 0: + roots_to_try = np.where(scores[1:, 0] >= 1 / len(scores))[0] + 1 + # *sigh* just grab the most likely one + if len(roots_to_try) == 0: + roots_to_try = np.array([np.argmax(scores[1:, 0]) + 1]) + + # ------------------------------------------------------------- + def set_root(scores, root): + root_score = scores[root, 0] + scores = np.array(scores) + scores[1:, 0] = 0 + scores[root] = 0 + scores[root, 0] = 1 + return scores, root_score + + # ------------------------------------------------------------- + + best_score, best_tree = -np.inf, None # This is what's causing it to crash + for root in roots_to_try: + _scores, root_score = set_root(scores, root) + _tree = self._chuliu_edmonds(_scores) + tree_probs = _scores[np.arange(len(_scores)), _tree] + tree_score = np.log(tree_probs).sum() + np.log(root_score) if tree_probs.all() else -np.inf + if tree_score > best_score: + best_score = tree_score + best_tree = _tree + try: + assert best_tree is not None + except: + with open('debug.log', 'w') as f: + f.write('{}: {}, {}\n'.format(tree, scores, roots_to_try)) + f.write('{}: {}, {}, {}\n'.format(_tree, _scores, tree_probs, tree_score)) + raise + return best_tree + + def decode(self, score, lens): + best_tree_list = [] + for ii in 
range(score.shape[0]): + # norm_score = score[ii, :lens[ii], :lens[ii]] + norm_score = np.zeros((lens[ii] + 1, lens[ii] + 1)) + for wii in range(lens[ii]): + for wjj in range(lens[ii] + 1): + norm_score[wii + 1, wjj] = score[ii, wii, wjj] + nWords = norm_score.shape[0] # len(norm_score) + norm_score *= (1 - np.eye(nWords)) + tree = self._chuliu_edmonds_one_root(norm_score) + best_tree_list.append(tree[1:]) + return best_tree_list diff --git a/cube/networks/utils_tokenizer.py b/cube/networks/utils_tokenizer.py new file mode 100644 index 000000000..4879c62ba --- /dev/null +++ b/cube/networks/utils_tokenizer.py @@ -0,0 +1,628 @@ +import sys +import torch +import numpy as np + +sys.path.append('') +from typing import * +from abc import abstractmethod +from transformers import AutoModel, AutoTokenizer +from cube.io_utils.encodings import Encodings +from cube.io_utils.objects import Sentence +from cube.networks.lm import LMHelperLanguasito, LMHelperFT +from torch.utils.data.dataset import Dataset + + +class TokenCollate: + def __init__(self): + pass + + @abstractmethod + def collate_fn(self, batch): + pass + + @abstractmethod + def get_embeddings_size(self) -> int: + pass + + @abstractmethod + def collate_fn_live(self, text, lang_id: int, batch_size: int): + pass + + +class LanguasitoTokenizer: + def __init__(self, no_space_language=False): + self._no_space_language = no_space_language + + def __call__(self, text): + if self._no_space_language: + return [ch for ch in text] + else: + toks = [] + tok = '' + for ch in text: + if not ch.isalnum() or ch == ' ': + tok = tok.strip() + if len(tok) != 0: + toks.append(tok) + tok = '' + if ch != ' ': + toks.append(ch) + else: + tok += ch + if tok.strip() != '': + toks.append(tok) + + return toks + + +def _make_example_from_raw(toks, iBatch, seq_len, overlap): + batch = [] + num_batches = len(toks[0]) // seq_len + if len(toks[0]) % seq_len != 0: + num_batches += 1 + start = iBatch * seq_len + stop = min(iBatch * seq_len + seq_len, len(toks[0])) + current = toks[0][start:stop] + left = max(0, start - overlap) + right = min(len(toks[0]), stop + overlap) + prev = toks[0][left:start] + next = toks[0][stop:right] + if len(toks) == 2: # TF or languasito + current_spa = toks[1][start:stop] + prev_spa = toks[1][left:start] + next_spa = toks[1][stop:right] + prev = (prev, prev_spa) + main = (current, current_spa) + next = (next, next_spa) + else: + current_spa = toks[2][start:stop] + prev_spa = toks[2][left:start] + next_spa = toks[2][stop:right] + current_ids = toks[1][start:stop] + prev_ids = toks[1][left:start] + next_ids = toks[1][stop:right] + prev = (prev, prev_ids, prev_spa) + main = (current, current_ids, current_spa) + next = (next, next_ids, next_spa) + + # if len(prev)==0: + # prev=[''] + # if len(next)==0: + # next=[''] + example = {'prev': prev, 'main': main, 'next': next} + return example + + +class TokenDatasetLive(Dataset): + def __init__(self, raw_text, pretokenize_func, seq_len=300, overlap=100): + self._tokenize = pretokenize_func + if raw_text[-1] != ' ': + raw_text += ' ' + self._toks = pretokenize_func(raw_text) # self._tokenize(raw_text) + self._seq_len = seq_len + self._overlap = overlap + self._num_examples = len(self._toks[0]) // seq_len + if len(self._toks[0]) % seq_len != 0: + self._num_examples += 1 + + def __len__(self): + return self._num_examples + + def __getitem__(self, item): + return _make_example_from_raw(self._toks, item, self._seq_len, self._overlap) + + +class TokenCollateFTLanguasito(TokenCollate): + def __init__(self, 
encodings: Encodings, lm_model: str = None, lm_device: str = 'cuda:0', no_space_lang=False, + lang_id=None): + self._encodings = encodings + # from languasito.utils import LanguasitoTokenizer + self._tokenizer = LanguasitoTokenizer(no_space_language=no_space_lang) + self._emb_size = 0 + self._lm_model = lm_model + self._lm_device = lm_device + self._lang_id = lang_id + self.max_seq_len = -1 + parts = lm_model.split(':') + if parts[0] == 'fasttext': + self._lm_helper = LMHelperFT(device=lm_device, model=parts[1]) + self._emb_size = [300] + elif parts[0] == 'languasito': + self._lm_helper = LMHelperLanguasito(device=lm_device, model=parts[1]) + self._emb_size = [512] + else: + print("UserWarning: unsupported LM type for tokenizer") + + def collate_fn(self, batch): + START = 0 + END = 2 + PAD = 1 + max_x = 0 + x_input = [] + x_input_spa = [] + x_lang = [] + y_output = [] + y_offset = [] + y_len = [] + x_text = [] + x_lang_word = [] + x_sent_len = [] + + x_word_embeddings = [] + a_word_len = [] + for example in batch: + for qq in ['prev', 'main', 'next']: + sent = example[qq] + # toks, ids = self._tokenize(sent.text) + if self._lang_id is None: + toks = self._tokenizer(sent.text) + l_id = sent.lang_id + else: + toks, spa = sent + l_id = self._lang_id + for word in toks: + a_word_len.append(len(word)) + x_lang_word.append(l_id) + + x_word_len = np.array(a_word_len, dtype=np.long) + max_word_len = np.max(x_word_len) + x_word_masks = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.float) + x_word = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + x_word_case = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + c_word = 0 + for example in batch: + sz = 0 + for qq in ['prev', 'main', 'next']: + sent = example[qq] + # toks, ids = self._tokenize(sent.text) + if self._lang_id is None: + toks = self._tokenizer(sent.text) + else: + toks, spa = sent + lst = toks + sz += len(lst) + for word in lst: + for iChar in range(len(word)): + x_word_masks[c_word, iChar] = 1 + ch = word[iChar] + if ch.lower() == ch.upper(): # symbol + x_word_case[c_word, iChar] = 1 + elif ch.lower() != ch: # upper + x_word_case[c_word, iChar] = 2 + else: # lower + x_word_case[c_word, iChar] = 3 + ch = ch.lower() + if ch in self._encodings.char2int: + x_word[c_word, iChar] = self._encodings.char2int[ch] + else: + x_word[c_word, iChar] = self._encodings.char2int[''] + c_word += 1 + x_sent_len.append(sz) + + for example in batch: + current_sentence = example['main'] + prev_sentence = example['prev'] + next_sentence = example['next'] + if self._lang_id is None: + x_lang.append(current_sentence.lang_id + 1) + else: + x_lang.append(self._lang_id + 1) + # toks, ids = self._tokenize(prev_sentence.text) + if self._lang_id is None: + toks, spa = self.get_tokens(prev_sentence.text) + else: + toks, spa = prev_sentence + x_prev = toks + x_prev_spa = spa + # toks, ids = self._tokenize(next_sentence.text) + if self._lang_id is None: + toks, spa = self.get_tokens(next_sentence.text) + else: + toks, spa = next_sentence + x_next = toks + x_next_spa = spa + y_offset.append(len(x_prev)) + # c_toks, ids = self._tokenize(current_sentence.text) + if self._lang_id is None: + c_toks, c_spa = self.get_tokens(current_sentence.text) + else: + c_toks, c_spa = current_sentence + x_main = c_toks + x_main_spa = c_spa + y_len.append(len(x_main)) + x_len = len(x_prev) + len(x_main) + len(x_next) + x_input.append([x_prev, x_main, x_next]) + x_input_spa.append([x_prev_spa, x_main_spa, x_next_spa]) + x_text.append(c_toks) + if 
self._lang_id is None:
+                y_output.append(self._get_targets(current_sentence))
+            else:
+                y_output.append(np.zeros(len(c_toks)))
+
+            if x_len > max_x:
+                max_x = x_len
+
+        x_for_emb = []
+        for example in x_input:
+            toks = example[0] + example[1] + example[2]
+            x_for_emb.append(toks)
+
+        x_emb = self._lm_helper.apply_raw(x_for_emb)
+        max_len = max([len(x) for x in x_emb])
+        x_out = np.zeros((len(x_emb), max_len, self._emb_size[0]), dtype=np.float)
+        x_out_spa = np.zeros((len(x_emb), max_len), dtype=np.long)
+        for ii in range(x_out.shape[0]):
+            for jj in range(x_out.shape[1]):
+                if jj < len(x_emb[ii]):
+                    x_out[ii, jj, :] = x_emb[ii][jj]
+
+            pos = 0
+            spa = x_input_spa[ii][0]
+            for jj in range(len(spa)):
+                x_out_spa[ii, pos] = spa[jj]
+                pos += 1
+            spa = x_input_spa[ii][1]
+            for jj in range(len(spa)):
+                x_out_spa[ii, pos] = spa[jj]
+                pos += 1
+            spa = x_input_spa[ii][2]
+            for jj in range(len(spa)):
+                x_out_spa[ii, pos] = spa[jj]
+                pos += 1
+
+        y_out = np.zeros((x_out.shape[0], x_out.shape[1]), dtype=np.long)
+        for ii in range(x_out.shape[0]):
+            for jj in range(y_len[ii]):
+                index = y_offset[ii] + jj
+                y_out[ii, index] = y_output[ii][jj]
+        x_out = torch.tensor(x_out)
+        x_lang = torch.tensor(x_lang)
+        y_out = torch.tensor(y_out)
+        y_offset = torch.tensor(y_offset)
+        y_len = torch.tensor(y_len)
+        x_word = torch.tensor(x_word)
+        x_word_case = torch.tensor(x_word_case)
+        x_word_masks = torch.tensor(x_word_masks)
+        x_word_len = torch.tensor(x_word_len)
+        x_lang_word = torch.tensor(x_lang_word)
+        x_sent_len = torch.tensor(x_sent_len)
+        x_input_spa = torch.tensor(x_out_spa)
+
+        return {'x_input': [x_out], 'x_input_spa': x_input_spa, 'x_word_char': x_word, 'x_word_case': x_word_case,
+                'x_word_masks': x_word_masks, 'x_word_len': x_word_len, 'x_word_lang': x_lang_word, 'x_text': x_text,
+                'x_lang': x_lang, 'y_output': y_out, 'y_offset': y_offset, 'y_len': y_len, 'x_sent_len': x_sent_len}
+
+    def _get_targets(self, sentence: Sentence):
+        text = sentence.text
+        toks = self._tokenizer(text)
+
+        targets = [0 for _ in range(len(toks))]
+        iToken = 0
+        cl = 0
+        for ii in range(len(targets)):
+            target = 1  # nothing
+            cl += len(toks[ii].replace(' ', ''))
+            if cl == len(sentence.tokens[iToken].text.replace(' ', '')):
+                iToken += 1
+                cl = 0
+                target = 2  # token
+                if len(sentence.tokens[iToken - 1].words) > 1:
+                    target = 3  # multiword token
+                if iToken == len(sentence.tokens):
+                    target = 4  # sentence end (+token)
+                    for tt in range(ii, len(targets)):
+                        targets[tt] = target  # extend "sentence end" over the remaining positions
+                    break
+            targets[ii] = target
+        return targets
+
+    def get_tokens(self, text):
+        toks = self._tokenizer(text)
+        spa = [0 for _ in range(len(toks))]
+        t_pos = 0
+        for ii in range(len(toks)):
+            t_pos += len(toks[ii])
+            if t_pos < len(text) and text[t_pos] == ' ':
+                spa[ii] = 2
+                while t_pos < len(text) and text[t_pos] == ' ':
+                    t_pos += 1
+            else:
+                spa[ii] = 1
+        return toks, spa
+
+    def get_embeddings_size(self) -> List[int]:
+        return self._emb_size
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # Don't pickle the LM helper; it is rebuilt in __setstate__
+        if "_lm_helper" in state:
+            del state["_lm_helper"]
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        parts = self._lm_model.split(':')
+        if parts[0] == 'fasttext':
+            self._lm_helper = LMHelperFT(device=self._lm_device, model=parts[1])
+            self._emb_size = [300]
+        elif parts[0] == 'languasito':
+            self._lm_helper = LMHelperLanguasito(device=self._lm_device, model=parts[1])
+            self._emb_size = [512]
+
+
+class TokenCollateHF(TokenCollate):
+    def __init__(self, encodings: Encodings, lm_device, 
lm_model=None, no_space_lang=False, lang_id=None): + if lm_model is None: + lm_model = 'xlm-roberta-base' + self._encodings = encodings # this is currently not used - we keep it for future development + self._pretokenizer = LanguasitoTokenizer(no_space_language=no_space_lang) + self._tokenizer = AutoTokenizer.from_pretrained(lm_model) + self._lm = AutoModel.from_pretrained(lm_model, output_hidden_states=True) + self._lm.eval() + self._lm_device = lm_device + self._lm.to(lm_device) + self._no_space = no_space_lang + tmp = self._lm(torch.tensor([[100]], device=lm_device)) + h_state_size = tmp['hidden_states'][0].shape[-1] + self._emb_size = [h_state_size for _ in range(len(tmp['hidden_states']))] + self._lang_id = lang_id + + self.max_seq_len = self._tokenizer.model_max_length + + def get_tokens(self, text): + toks, ids = self._tokenize(text) + spa = [0 for _ in range(len(toks))] + t_pos = 0 + for ii in range(len(toks)): + t_pos += len(toks[ii].replace('▁', '')) + if t_pos < len(text) and text[t_pos] == ' ': + spa[ii] = 2 + while t_pos < len(text) and text[t_pos] == ' ': + t_pos += 1 + else: + spa[ii] = 1 + return toks, ids, spa + + def collate_fn(self, batch): + START = 0 + END = 2 + PAD = 1 + max_x = 0 + x_input = [] + x_input_spa = [] + x_lang = [] + y_output = [] + y_offset = [] + y_len = [] + x_text = [] + x_spa = [] + x_lang_word = [] + x_sent_len = [] + + x_word_embeddings = [] + a_word_len = [] + for example in batch: + for qq in ['prev', 'main', 'next']: + sent = example[qq] + if self._lang_id is None: + l_id = sent.lang_id + text = sent.text + toks, ids, spa = self.get_tokens(text) + else: + l_id = self._lang_id + if len(sent) == 3: + toks, ids, spa = sent + else: + toks, ids, spa = [], [], [] + for word in toks: + a_word_len.append(len(word)) + x_lang_word.append(l_id) + x_word_len = np.array(a_word_len, dtype=np.long) + max_word_len = np.max(x_word_len) + x_word_masks = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.float) + x_word = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + x_word_case = np.zeros((x_word_len.shape[0], max_word_len), dtype=np.long) + c_word = 0 + for example in batch: + sz = 0 + for qq in ['prev', 'main', 'next']: + sent = example[qq] + if self._lang_id is None: + l_id = sent.lang_id + toks, ids, spa = self.get_tokens(sent.text) + else: + l_id = self._lang_id + if len(sent) == 3: + toks, ids, spa = sent + else: + toks, ids, spa = [], [], [] + + lst = toks + sz += len(lst) + for word in lst: + for iChar in range(len(word)): + x_word_masks[c_word, iChar] = 1 + ch = word[iChar] + if ch.lower() == ch.upper(): # symbol + x_word_case[c_word, iChar] = 1 + elif ch.lower() != ch: # upper + x_word_case[c_word, iChar] = 2 + else: # lower + x_word_case[c_word, iChar] = 3 + ch = ch.lower() + if ch in self._encodings.char2int: + x_word[c_word, iChar] = self._encodings.char2int[ch] + else: + x_word[c_word, iChar] = self._encodings.char2int[''] + c_word += 1 + x_sent_len.append(sz) + + for example in batch: + current_sentence = example['main'] + prev_sentence = example['prev'] + next_sentence = example['next'] + if self._lang_id is None: + x_lang.append(current_sentence.lang_id + 1) + else: + x_lang.append(self._lang_id + 1) + if self._lang_id is None: + toks, ids, spa = self.get_tokens(prev_sentence.text) + else: + if len(prev_sentence) == 3: + toks, ids, spa = prev_sentence + else: + toks, ids, spa = [], [], [] + x_prev = ids + x_prev_spa = spa + if self._lang_id is None: + toks, ids, spa = self.get_tokens(next_sentence.text) + else: + if 
len(next_sentence) == 3: + toks, ids, spa = next_sentence + else: + toks, ids, spa = [], [], [] + x_next = ids + x_next_spa = spa + y_offset.append(len(x_prev)) + if self._lang_id is None: + c_toks, ids, c_spa = self.get_tokens(current_sentence.text) + else: + if len(current_sentence) == 3: + c_toks, ids, c_spa = current_sentence + else: + c_toks, ids, c_spa = [], [], [] + + x_main = ids + x_main_spa = c_spa + y_len.append(len(x_main)) + x_len = len(x_prev) + len(x_main) + len(x_next) + x_input.append([x_prev, x_main, x_next]) + x_input_spa.append([x_prev_spa, x_main_spa, x_next_spa]) + x_text.append(c_toks) + x_spa.append(c_spa) + if self._lang_id is None: + y_output.append(self._get_targets(current_sentence)) + else: + y_output.append([0 for _ in range(len(c_toks))]) + if x_len > max_x: + max_x = x_len + + x_out = np.ones((len(batch), max_x), dtype=np.long) * PAD + x_out_spa = np.zeros((len(batch), max_x), dtype=np.long) + for ii in range(len(batch)): + # x_out[ii, 0] = START + pos = 0 + x = x_input[ii][0] + x_spa = x_input_spa[ii][0] + for jj in range(len(x)): + x_out[ii, pos] = x[jj] + x_out_spa[ii, pos] = x_spa[jj] + pos += 1 + x = x_input[ii][1] + x_spa = x_input_spa[ii][1] + for jj in range(len(x)): + x_out[ii, pos] = x[jj] + x_out_spa[ii, pos] = x_spa[jj] + pos += 1 + x = x_input[ii][2] + x_spa = x_input_spa[ii][2] + for jj in range(len(x)): + x_out[ii, pos] = x[jj] + x_out_spa[ii, pos] = x_spa[jj] + pos += 1 + # x_out[ii, pos] = END + + y_out = np.zeros((x_out.shape[0], x_out.shape[1]), dtype=np.long) + for ii in range(x_out.shape[0]): + for jj in range(y_len[ii]): + index = y_offset[ii] + jj + y_out[ii, index] = y_output[ii][jj] + x_out = torch.tensor(x_out, device=self._lm_device) + x_lang = torch.tensor(x_lang) + y_out = torch.tensor(y_out) + x_input_spa = torch.tensor(x_out_spa, dtype=torch.long) + y_offset = torch.tensor(y_offset) + y_len = torch.tensor(y_len) + x_word = torch.tensor(x_word) + x_word_case = torch.tensor(x_word_case) + x_word_masks = torch.tensor(x_word_masks) + x_word_len = torch.tensor(x_word_len) + x_lang_word = torch.tensor(x_lang_word) + x_sent_len = torch.tensor(x_sent_len) + with torch.no_grad(): + if x_out.size()[1] > self.max_seq_len: + # print() + # print(x_out.size()) + # hack to skip batch if len is to big + # we cannot return none because pytorch lightning will complain, so we set x_input = None and check + # it in the train_step. 
TODO: check for overly long input strings during preprocessing
+                return {'x_input': None, 'x_input_spa': x_input_spa, 'x_word_char': x_word, 'x_word_case': x_word_case,
+                        'x_word_masks': x_word_masks,
+                        'x_word_len': x_word_len, 'x_word_lang': x_lang_word, 'x_text': x_text, 'x_lang': x_lang,
+                        'y_output': y_out, 'y_offset': y_offset, 'y_len': y_len, 'x_sent_len': x_sent_len}
+            x_out = self._lm(x_out)['hidden_states']
+            x_out = [t.detach() for t in x_out]
+        return {'x_input': x_out, 'x_input_spa': x_input_spa, 'x_word_char': x_word, 'x_word_case': x_word_case,
+                'x_word_masks': x_word_masks,
+                'x_word_len': x_word_len, 'x_word_lang': x_lang_word, 'x_text': x_text, 'x_lang': x_lang,
+                'y_output': y_out, 'y_offset': y_offset, 'y_len': y_len, 'x_sent_len': x_sent_len}
+
+    def _tokenize(self, text):
+        if self._no_space:
+            new_text = [ch for ch in text]
+        else:
+            new_text = self._pretokenizer(text)
+        # print("\n" + ("_" * 50))
+        # print(new_text)
+        # print("_" * 50)
+        toks = self._tokenizer.tokenize(new_text, is_split_into_words=True)
+        ids = self._tokenizer(new_text, is_split_into_words=True)['input_ids'][1:-1]
+        r_toks = []
+        r_ids = []
+        strip_text = text.replace(' ', '').replace('\n', '').replace('\r', '').replace('\t', '')
+        strip_index = 0
+        # if len(toks) != 0:  # empty text
+        #     r_toks.append(toks[0])
+        #     r_ids.append(ids[0])
+        for ii in range(0, len(toks)):
+            if toks[ii] != '▁':
+                r_toks.append(toks[ii])
+                r_ids.append(ids[ii])
+        # if len(r_toks) > 509 or len(r_ids) > 509 or len(r_toks) < 1 or len(r_ids) < 1:
+        #     print(f"\n>> text:[{text}] [{len(r_toks)}] [{len(r_ids)}]")
+        f_toks = []
+        for tok in r_toks:
+            tok = tok.replace('▁', '')
+            s_len = len(tok)
+            f_toks.append(strip_text[strip_index:strip_index + s_len])
+            strip_index += s_len
+
+        return f_toks, r_ids
+
+    def _get_targets(self, sentence: Sentence):
+        text = sentence.text
+        toks, ids = self._tokenize(text)
+        toks = [tok.replace('▁', '') for tok in toks]
+        targets = [0 for _ in range(len(toks))]
+        iToken = 0
+        cl = 0
+        for ii in range(len(targets)):
+            target = 1  # nothing
+            cl += len(toks[ii])
+            if cl == len(sentence.tokens[iToken].text):
+                iToken += 1
+                cl = 0
+                target = 2  # token
+                if len(sentence.tokens[iToken - 1].words) > 1:
+                    target = 3  # multiword token
+                if iToken == len(sentence.tokens):
+                    target = 4  # sentence end (+token)
+                    for tt in range(ii, len(targets)):
+                        targets[tt] = target  # extend "sentence end" over the remaining positions
+                    break
+            targets[ii] = target
+        return targets
+
+    def get_embeddings_size(self) -> List[int]:
+        return self._emb_size
diff --git a/cube/trainer.py b/cube/trainer.py
new file mode 100644
index 000000000..6dc0e0897
--- /dev/null
+++ b/cube/trainer.py
@@ -0,0 +1,278 @@
+import os, sys, yaml
+
+sys.path.append(".")
+
+from pytorch_lightning.callbacks import EarlyStopping
+
+from argparse import ArgumentParser
+
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+
+from cube.io_utils.config import TaggerConfig, ParserConfig, TokenizerConfig, LemmatizerConfig, CompoundConfig
+from cube.io_utils.encodings import Encodings
+from cube.io_utils.objects import Document
+from cube.networks.tokenizer import Tokenizer
+from cube.networks.tagger import Tagger
+from cube.networks.parser import Parser
+from cube.networks.lemmatizer import Lemmatizer
+from cube.networks.compound import Compound
+from cube.networks.utils import MorphoDataset, MorphoCollate, TokenizationDataset, \
+    Word2TargetCollate, LemmaDataset, CompoundDataset
+from cube.networks.utils_tokenizer import TokenCollateHF, TokenCollateFTLanguasito
+from cube.networks.lm import LMHelperFT, LMHelperHF, LMHelperLanguasito
+
+
+class Trainer:
+    def __init__(self, task: str, language_map: {}, language_codes: [], train_files: {}, dev_files: {}, test_files: {},
+                 args):
+        self.task = None
+        if task not in ["tokenizer", "lemmatizer", "cwe", "tagger", "parser"]:
+            raise Exception("Task must be one of: tokenizer, lemmatizer, cwe, tagger or parser.")
+
+        self.store_prefix = args.store
+        self.language_map = language_map
+        self.language_codes = language_codes
+        self.args = args
+
+        # TODO assert lang_id matches train
+        # TODO assert train files code are found in dev files
+        # lang_code_list = sorted(list(set(code for code in train_files )))
+
+        self.task = task
+        self.doc_train = Document()
+        self.doc_dev = Document()
+        self.doc_test = Document()
+
+        for lang_code in train_files:
+            print("Reading train file for language code {} : {}".format(lang_code, train_files[lang_code]))
+            self.doc_train.load(train_files[lang_code], lang_id=language_codes.index(lang_code))
+        for lang_code in dev_files:
+            print("Reading dev file for language code {} : {}".format(lang_code, dev_files[lang_code]))
+            self.doc_dev.load(dev_files[lang_code], lang_id=language_codes.index(lang_code))
+        for lang_code in test_files:
+            print("Reading test file for language code {} : {}".format(lang_code, test_files[lang_code]))
+            self.doc_test.load(test_files[lang_code], lang_id=language_codes.index(lang_code))
+
+        # ensure store dir exists
+        i = self.store_prefix.rfind("/")
+        if i > 0:
+            if i >= len(self.store_prefix) - 1:
+                raise Exception(
+                    "store_prefix is a folder; please specify the prefix of the models after the '/', like 'data/tagger'.")
+
+            target_folder = self.store_prefix[:i]
+            model_prefix = self.store_prefix[i + 1:]
+            os.makedirs(target_folder, exist_ok=True)
+            print("Saving model in {}, with prefix {}".format(target_folder, model_prefix))
+        else:
+            print("Saving model in the current folder, with prefix {}".format(self.store_prefix))
+
+    def fit(self):
+        if self.task not in ["tokenizer", "lemmatizer", "cwe", "tagger", "parser"]:
+            raise Exception("Task must be one of: tokenizer, lemmatizer, cwe, tagger or parser.")
+
+        with open(self.args.store + ".yaml", 'w') as f:
+            yaml.dump({"language_map": self.language_map, "language_codes": self.language_codes}, f, sort_keys=True)
+
+        enc = Encodings()
+        enc.compute(self.doc_train, None)
+        enc.save('{0}.encodings'.format(self.store_prefix))
+
+        if self.task == "tokenizer":
+            config = TokenizerConfig()
+            no_space_lang = Tokenizer._detect_no_space_lang(self.doc_train)
+            print("NO_SPACE_LANG = " + str(no_space_lang))
+            config.no_space_lang = no_space_lang
+        if self.task == "tagger":
+            config = TaggerConfig()
+        if self.task == "lemmatizer":
+            config = LemmatizerConfig()
+        if self.task == "parser":
+            config = ParserConfig()
+        if self.task == "cwe":
+            config = CompoundConfig()
+            config.lm_model = self.args.lm_model
+        if self.args.config_file:
+            config.load(self.args.config_file)
+        if self.args.lm_model is not None:
+            config.lm_model = self.args.lm_model
+        config.save('{}.config'.format(self.args.store))
+
+        if self.task != "tokenizer" and self.task != 'lemmatizer' and self.task != 'cwe':
+            lm_model = config.lm_model
+            parts = lm_model.split(':')
+            if parts[0] not in ['transformer', 'fasttext', 'languasito']:
+                print("Error: lm_model must start with one of the prefixes 'transformer:', 'fasttext:' or 'languasito:'")
+                sys.exit(1)
+            if parts[0] == 'transformer':
+                helper = LMHelperHF(device=self.args.lm_device, model=parts[1])
+            elif parts[0] == 'fasttext':
+ helper = LMHelperFT(device=self.args.lm_device, model=parts[1]) + elif parts[0] == 'languasito': + helper = LMHelperLanguasito(device=self.args.lm_device, model=parts[1]) + helper.apply(self.doc_dev) + helper.apply(self.doc_train) + + if self.task == "tokenizer": + trainset = TokenizationDataset(self.doc_train) + devset = TokenizationDataset(self.doc_dev, shuffle=False) + elif self.task == 'parser' or self.task == 'tagger': + trainset = MorphoDataset(self.doc_train) + devset = MorphoDataset(self.doc_dev) + elif self.task == 'lemmatizer': + trainset = LemmaDataset(self.doc_train) + devset = LemmaDataset(self.doc_dev) + elif self.task == 'cwe': + trainset = CompoundDataset(self.doc_train) + devset = CompoundDataset(self.doc_dev) + + collate = MorphoCollate(enc) + + # per task specific settings + callbacks = [] + if self.task == "tokenizer": + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + parts = args.lm_model.split(':') + if parts[0] == 'transformer': + collate = TokenCollateHF(enc, lm_device=args.lm_device, lm_model=parts[1], + no_space_lang=config.no_space_lang) + else: + collate = TokenCollateFTLanguasito(enc, lm_device=args.lm_device, lm_model=args.lm_model, + no_space_lang=config.no_space_lang) + + callbacks = [early_stopping_callback, Tokenizer.PrintAndSaveCallback(self.store_prefix)] + model = Tokenizer(config=config, encodings=enc, language_codes=self.language_codes, + ext_word_emb=collate.get_embeddings_size(), max_seq_len=collate.max_seq_len) + + if self.task == "tagger": + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + callbacks = [early_stopping_callback, Tagger.PrintAndSaveCallback(self.store_prefix)] + model = Tagger(config=config, encodings=enc, language_codes=self.language_codes, + ext_word_emb=helper.get_embedding_size()) + + if self.task == "parser": + collate = MorphoCollate(enc, add_parsing=True, rhl_win_size=config.rhl_win_size) + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + callbacks = [early_stopping_callback, Parser.PrintAndSaveCallback(self.store_prefix)] + model = Parser(config=config, encodings=enc, language_codes=self.language_codes, + ext_word_emb=helper.get_embedding_size()) + + if self.task == "lemmatizer": + collate = Word2TargetCollate(enc) + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + callbacks = [early_stopping_callback, Lemmatizer.PrintAndSaveCallback(self.store_prefix)] + model = Lemmatizer(config=config, encodings=enc, language_codes=self.language_codes) + + if self.task == "cwe": + collate = Word2TargetCollate(enc) + early_stopping_callback = EarlyStopping( + monitor='val/early_meta', + patience=args.patience, + verbose=True, + mode='max' + ) + callbacks = [early_stopping_callback, Compound.PrintAndSaveCallback(self.store_prefix)] + model = Compound(config=config, encodings=enc, language_codes=self.language_codes) + # extra check to see if there is actually any compound in this language + if len(trainset._examples) == 0 or len(devset._examples) == 0: + print("\nTrain/dev data for this language does not contain any compound words; there is nothing to train.") + return + + # dataloaders + train_loader = DataLoader(trainset, batch_size=self.args.batch_size, collate_fn=collate.collate_fn, + shuffle=True, + 
num_workers=self.args.num_workers)
+        val_loader = DataLoader(devset, batch_size=self.args.batch_size, collate_fn=collate.collate_fn,
+                                num_workers=self.args.num_workers)
+
+        # pre-train checks
+        resume_from_checkpoint = None
+        if self.args.resume is True:
+            resume_from_checkpoint = self.store_prefix + ".last"
+            if not os.path.exists(resume_from_checkpoint):
+                raise Exception("Resume from checkpoint: {} not found!".format(resume_from_checkpoint))
+
+        """if self.args.gpus == 0:
+            acc = 'ddp_cpu'
+        else:
+            acc = 'ddp'
+        """
+
+        trainer = pl.Trainer(
+            gpus=args.gpus,
+            accelerator=args.accelerator,
+            # num_nodes=1,
+            default_root_dir='data/',
+            callbacks=callbacks,
+            resume_from_checkpoint=resume_from_checkpoint,
+            accumulate_grad_batches=args.accumulate_grad_batches,
+            # limit_train_batches=100,
+            # limit_val_batches=4,
+        )
+
+        # run fit
+        print("\nStarting train\n")
+        trainer.fit(model, train_loader, val_loader)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description='NLP-Cube Trainer Helper')
+    parser.add_argument('--task', action='store', dest='task',
+                        help='Type of task: "tokenizer", "lemmatizer", "cwe", "tagger", "parser"')
+    parser.add_argument('--train', action='store', dest='train_file',
+                        help='YAML file listing the language_map, language_codes and train/dev/test files')
+    parser.add_argument('--patience', action='store', type=int, default=20, dest='patience',
+                        help='Number of epochs before early stopping (default=20)')
+    parser.add_argument('--store', action='store', dest='store', help='Output base', default='data/model')
+    parser.add_argument('--num-workers', action='store', dest='num_workers', type=int,
+                        help='How many dataloaders to use (default=4)', default=4)
+    parser.add_argument('--batch-size', action='store', type=int, default=16, dest='batch_size',
+                        help='Batch size (default=16)')
+    parser.add_argument('--debug', action='store_true', dest='debug',
+                        help='Run in debug mode')
+    parser.add_argument('--resume', action='store_true', dest='resume', help='Resume training')
+    parser.add_argument('--lm-model', action='store', dest='lm_model',
+                        help='What LM model to use (default=xlm-roberta-base)')
+    parser.add_argument('--lm-device', action='store', dest='lm_device', default='cuda:0',
+                        help='Where to load LM (default=cuda:0)')
+    parser.add_argument('--config', action='store', dest='config_file', help='Load config file')
+
+    parser = pl.Trainer.add_argparse_args(parser)  # add all pytorch lightning params here as well
+
+    args = parser.parse_args()
+
+    with open(args.train_file) as file:
+        train_config = yaml.full_load(file)
+
+    trainer_object = Trainer(
+        task=args.task,
+        language_map=train_config["language_map"],
+        language_codes=train_config["language_codes"],
+        train_files=train_config["train_files"],
+        dev_files=train_config["dev_files"],
+        test_files=train_config["test_files"],
+        args=args,
+    )
+
+    trainer_object.fit()
diff --git a/cube/version.py b/cube/version.py
new file mode 100644
index 000000000..ce3611157
--- /dev/null
+++ b/cube/version.py
@@ -0,0 +1 @@
+__version__ = "3.0"
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 64136630f..617b23838 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -7,16 +7,6 @@ RUN apt-get update && apt-get install -y build-essential automake make cmake g++
 # Preparing Python build environment
 RUN pip3 install cython future scipy nltk requests xmltodict nose2
 
-# Installing MKL library
-RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB && \
-    apt-key add 
GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB && \
-    wget https://apt.repos.intel.com/setup/intelproducts.list -O /etc/apt/sources.list.d/intelproducts.list && \
-    apt-get update && \
-    apt-get install -y intel-mkl-64bit-2018.2-046
-
-# Installing DyNET
-RUN pip3 install dynet
-
 # Prepare environment UTF-8
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales
 RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
@@ -30,6 +20,8 @@ ENV LC_ALL en_US.UTF-8
 RUN mkdir /work && \
     cd /work && \
-    git clone https://github.com/adobe/NLP-Cube.git
+    git clone https://github.com/adobe/NLP-Cube.git && \
+    cd NLP-Cube && \
+    pip3 install -r requirements.txt
 
 # Prepare notebook
 RUN pip3 install jupyter
@@ -37,5 +29,6 @@ RUN pip3 install Flask
 RUN pip3 install bs4
 
 # Start webserver
-CMD cd /work/NLP-Cube/cube/ && python3 webserver.py --port 8080 --lang=en --lang=fr --lang=de
-
+#CMD cd /work/NLP-Cube/cube/ && python3 webserver.py --port 8080 --lang=en --lang=fr --lang=de
+# Start notebook
+CMD cd /work/NLP-Cube/ && python3 -m "notebook" --allow-root --ip=0.0.0.0 --no-browser
diff --git a/examples/catalog.json b/examples/catalog.json
new file mode 100644
index 000000000..95f493bc2
--- /dev/null
+++ b/examples/catalog.json
@@ -0,0 +1,125 @@
+{
+  "languages": [
+    [
+      "ca_ancora",
+      "ca",
+      "romance",
+      0
+    ],
+    [
+      "fr_partut",
+      "fr_partut",
+      "romance",
+      1
+    ],
+    [
+      "fro_srcmf",
+      "fro",
+      "romance",
+      2
+    ],
+    [
+      "fr_ftb",
+      "fr_ftb",
+      "romance",
+      3
+    ],
+    [
+      "fr_spoken",
+      "fr_spoken",
+      "romance",
+      4
+    ],
+    [
+      "fr_sequoia",
+      "fr_sequoia",
+      "romance",
+      5
+    ],
+    [
+      "fr_gsd",
+      "fr",
+      "romance",
+      6
+    ],
+    [
+      "gl_treegal",
+      "gl_treegal",
+      "romance",
+      7
+    ],
+    [
+      "gl_ctg",
+      "gl",
+      "romance",
+      8
+    ],
+    [
+      "it_vit",
+      "it_vit",
+      "romance",
+      9
+    ],
+    [
+      "it_postwita",
+      "it_postwita",
+      "romance",
+      10
+    ],
+    [
+      "it_partut",
+      "it_partut",
+      "romance",
+      11
+    ],
+    [
+      "it_twittiro",
+      "it_twittiro",
+      "romance",
+      12
+    ],
+    [
+      "it_isdt",
+      "it",
+      "romance",
+      13
+    ],
+    [
+      "pt_gsd",
+      "pt",
+      "romance",
+      14
+    ],
+    [
+      "pt_bosque",
+      "pt_bosque",
+      "romance",
+      15
+    ],
+    [
+      "ro_nonstandard",
+      "ro_nonstandard",
+      "romance",
+      16
+    ],
+    [
+      "ro_rrt",
+      "ro",
+      "romance",
+      17
+    ],
+    [
+      "es_ancora",
+      "es_ancora",
+      "romance",
+      18
+    ],
+    [
+      "es_gsd",
+      "es",
+      "romance",
+      19
+    ]
+  ],
+  "base_url": "https://github.com/adobe/NLP-Cube-Models/blob/2.0/models/"
+}
diff --git a/examples/multilanguage/all.txt b/examples/multilanguage/all.txt
new file mode 100644
index 000000000..90468ca2a
--- /dev/null
+++ b/examples/multilanguage/all.txt
@@ -0,0 +1,92 @@
+ Afrikaans 1 49K IE, Germanic
+ Akkadian 1 1K Afro-Asiatic, Semitic
+ Albanian 1 <1K IE, Albanian
+ Amharic 1 10K Afro-Asiatic, Semitic
+ Ancient_Greek 2 416K IE, Greek
+ Arabic 3 1,042K Afro-Asiatic, Semitic
+ Armenian 1 52K IE, Armenian
+ Assyrian 1 <1K Afro-Asiatic, Semitic
+ Bambara 1 13K Mande
+ Basque 1 121K Basque
+ Belarusian 1 13K IE, Slavic
+ Bhojpuri 2 4K IE, Indic
+ Breton 1 10K IE, Celtic
+ Bulgarian 1 156K IE, Slavic
+ Buryat 1 10K Mongolic
+ Cantonese 1 13K Sino-Tibetan
+ Catalan 1 531K IE, Romance
+ Chinese 5 285K Sino-Tibetan
+ Classical_Chinese 1 74K Sino-Tibetan
+ Coptic 1 40K Afro-Asiatic, Egyptian
+ Croatian 1 199K IE, Slavic
+ Czech 5 2,222K IE, Slavic
+ Danish 2 100K IE, Germanic
+ Dutch 2 306K IE, Germanic
+ English 9 620K IE, Germanic
+ Erzya 1 15K Uralic, Mordvin
+ Estonian 2 465K Uralic, Finnic
+ Faroese 1 10K IE, Germanic
+ Finnish 3 377K Uralic, Finnic
+ French 8 1,157K IE, Romance
+ Galician 2 164K IE, Romance
+ German 4 
3,753K IE, Germanic + Gothic 1 55K IE, Germanic + Greek 1 63K IE, Greek + Hebrew 1 161K Afro-Asiatic, Semitic + Hindi 2 375K IE, Indic + Hindi_English 1 26K Indic + Hungarian 1 42K Uralic, Ugric + Indonesian 2 141K Austronesian, Malayo-Sumbawan + Irish 1 40K IE, Celtic + Italian 6 811K IE, Romance + Japanese 5 1,688K Japanese + Karelian 1 3K Uralic, Finnic + Kazakh 1 10K Turkic, Northwestern + Komi_Permyak 1 <1K Uralic, Permic + Komi_Zyrian 2 3K Uralic, Permic + Korean 5 446K Korean + Kurmanji 1 10K IE, Iranian + Latin 4 582K IE, Latin + Latvian 1 220K IE, Baltic + Lithuanian 2 75K IE, Baltic + Livvi 1 1K Uralic, Finnic + Magahi 1 7K IE, Indic + Maltese 1 44K Afro-Asiatic, Semitic + Marathi 1 3K IE, Indic + Mbya_Guarani 2 13K Tupian + Moksha 1 <1K Uralic, Mordvin + Naija 1 12K Creole + North_Sami 1 26K Uralic, Sami + Norwegian 3 666K IE, Germanic + Old_Church_Slavonic 1 57K IE, Slavic + Old_French 1 170K IE, Romance + Old_Russian 2 168K IE, Slavic + Persian 1 152K IE, Iranian + Polish 3 499K IE, Slavic + Portuguese 3 570K IE, Romance + Romanian 3 551K IE, Romance + Russian 4 1,263K IE, Slavic + Sanskrit 1 1K IE, Indic + Scottish_Gaelic 1 42K IE, Celtic + Serbian 1 97K IE, Slavic + Skolt Sami 1 <1K Uralic, Sami + Slovak 1 106K IE, Slavic + Slovenian 2 170K IE, Slavic + Spanish 3 1,004K IE, Romance + Swedish 3 206K IE, Germanic + Swedish_Sign_Language 1 1K Sign_Language + Swiss_German 1 1K IE, Germanic + Tagalog 2 <1K Austronesian, Central Philippine + Tamil 1 9K Dravidian, Southern + Telugu 1 6K Dravidian, South Central + Thai 1 22K Tai-Kadai + Turkish 4 91K Turkic, Southwestern + Ukrainian 1 122K IE, Slavic + Upper Sorbian 1 11K IE, Slavic + Urdu 1 138K IE, Indic + Uyghur 1 40K Turkic, Southeastern + Vietnamese 1 43K Austro-Asiatic, Viet-Muong + Warlpiri 1 <1K Pama-Nyungan + Welsh 1 16K IE, Celtic + Wolof 1 44K Niger-Congo, Northern Atlantic + Yoruba 1 2K Niger-Congo, Defoid diff --git a/examples/multilanguage/autogenerated/afro-asiatic.json b/examples/multilanguage/autogenerated/afro-asiatic.json new file mode 100644 index 000000000..7e292b03b --- /dev/null +++ b/examples/multilanguage/autogenerated/afro-asiatic.json @@ -0,0 +1,22 @@ +[ + [ + "ar_padt", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu" + ], + [ + "cop_scriptorium", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu" + ], + [ + "he_htb", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu" + ], + [ + "mt_mudt", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/albanian.json b/examples/multilanguage/autogenerated/albanian.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/albanian.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/all.json b/examples/multilanguage/autogenerated/all.json new file mode 100644 index 000000000..8e724bc0f --- /dev/null +++ b/examples/multilanguage/autogenerated/all.json @@ -0,0 +1,507 @@ +[ + [ + "af_afribooms", + "corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu", + 
"corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" + ], + [ + "da_ddt", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu" + ], + [ + "nl_lassysmall", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu" + ], + [ + "nl_alpino", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu" + ], + [ + "en_lines", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu" + ], + [ + "en_gum", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu" + ], + [ + "en_ewt", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu" + ], + [ + "en_partut", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu" + ], + [ + "de_gsd", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu" + ], + [ + "de_hdt", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu" + ], + [ + "got_proiel", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu" + ], + [ + "no_nynorsk", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu" + ], + [ + "no_bokmaal", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu" + ], + [ + "no_nynorsklia", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu" + ], + [ + "swl_sslc", + "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu" + ], + [ + "sv_talbanken", + "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu" + ], + [ + "sv_lines", + "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu" + ], + [ + "ar_padt", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu" + ], + [ + "cop_scriptorium", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu" + ], + [ + "he_htb", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu" + ], + [ + "mt_mudt", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu" + ], + [ + "grc_perseus", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu", + 
"corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" + ], + [ + "grc_proiel", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" + ], + [ + "el_gdt", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu" + ], + [ + "hy_armtdp", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu" + ], + [ + "eu_bdt", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu" + ], + [ + "be_hse", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu" + ], + [ + "bg_btb", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" + ], + [ + "hr_set", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu" + ], + [ + "cs_pdt", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu" + ], + [ + "cs_cltt", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu" + ], + [ + "cs_fictree", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu" + ], + [ + "cs_cac", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu" + ], + [ + "cu_proiel", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu" + ], + [ + "orv_torot", + "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu" + ], + [ + "pl_lfg", + "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu" + ], + [ + "pl_pdb", + "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu" + ], + [ + "ru_gsd", + "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu" + ], + [ + "ru_syntagrus", + "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu" + ], + [ + "ru_taiga", + "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu" + ], + [ + "sr_set", + "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu" + ], + [ + "sk_snk", + "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu" + ], + [ + "sl_ssj", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu" + ], + [ + "sl_sst", + 
"corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu" + ], + [ + "uk_iu", + "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu" + ], + [ + "hsb_ufal", + "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu" + ], + [ + "hi_hdtb", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" + ], + [ + "mr_ufal", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu" + ], + [ + "ur_udtb", + "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu" + ], + [ + "ga_idt", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu" + ], + [ + "gd_arcosg", + "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu" + ], + [ + "bxr_bdt", + "corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu" + ], + [ + "zh_gsd", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu" + ], + [ + "zh_gsdsimp", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu" + ], + [ + "lzh_kyoto", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu" + ], + [ + "ca_ancora", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" + ], + [ + "fr_partut", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu" + ], + [ + "fro_srcmf", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu" + ], + [ + "fr_ftb", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu" + ], + [ + "fr_spoken", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu" + ], + [ + "fr_sequoia", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu" + ], + [ + "fr_gsd", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu" + ], + [ + "gl_treegal", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu" + ], + [ + "gl_ctg", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu" + ], + [ + "it_vit", + "corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu", + 
"corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu" + ], + [ + "it_postwita", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu" + ], + [ + "it_partut", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu" + ], + [ + "it_twittiro", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu" + ], + [ + "it_isdt", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu" + ], + [ + "pt_gsd", + "corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu" + ], + [ + "pt_bosque", + "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu" + ], + [ + "ro_nonstandard", + "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu" + ], + [ + "ro_rrt", + "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu" + ], + [ + "es_ancora", + "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu" + ], + [ + "es_gsd", + "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu" + ], + [ + "et_edt", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu" + ], + [ + "et_ewt", + "corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu" + ], + [ + "fi_ftb", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu" + ], + [ + "fi_tdt", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu" + ], + [ + "hu_szeged", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu" + ], + [ + "olo_kkpp", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu" + ], + [ + "sme_giella", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu" + ], + [ + "id_gsd", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu" + ], + [ + "ja_gsd", + "corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu" + ], + [ + "kk_ktb", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu" + ], + [ + "tr_imst", + "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu" + ], + [ + "ug_udt", + 
"corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu" + ], + [ + "ko_gsd", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu" + ], + [ + "ko_kaist", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu" + ], + [ + "kmr_mg", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu" + ], + [ + "fa_seraji", + "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu" + ], + [ + "la_perseus", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu" + ], + [ + "la_proiel", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu" + ], + [ + "la_ittb", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu" + ], + [ + "lv_lvtb", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu" + ], + [ + "lt_alksnis", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu" + ], + [ + "lt_hse", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu" + ], + [ + "ta_ttb", + "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu" + ], + [ + "te_mtg", + "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu" + ], + [ + "vi_vtb", + "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu" + ], + [ + "wo_wtb", + "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/armenian.json b/examples/multilanguage/autogenerated/armenian.json new file mode 100644 index 000000000..0a960ec48 --- /dev/null +++ b/examples/multilanguage/autogenerated/armenian.json @@ -0,0 +1,7 @@ +[ + [ + "hy_armtdp", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/austro-asiatic.json b/examples/multilanguage/autogenerated/austro-asiatic.json new file mode 100644 index 000000000..abef5dc65 --- /dev/null +++ b/examples/multilanguage/autogenerated/austro-asiatic.json @@ -0,0 +1,7 @@ +[ + [ + "vi_vtb", + "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/austronesian.json b/examples/multilanguage/autogenerated/austronesian.json new file mode 100644 index 000000000..280839fc6 --- /dev/null +++ 
b/examples/multilanguage/autogenerated/austronesian.json @@ -0,0 +1,7 @@ +[ + [ + "id_gsd", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/baltic.json b/examples/multilanguage/autogenerated/baltic.json new file mode 100644 index 000000000..a964ae427 --- /dev/null +++ b/examples/multilanguage/autogenerated/baltic.json @@ -0,0 +1,17 @@ +[ + [ + "lv_lvtb", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu" + ], + [ + "lt_alksnis", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu" + ], + [ + "lt_hse", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/basque.json b/examples/multilanguage/autogenerated/basque.json new file mode 100644 index 000000000..3ec1351a4 --- /dev/null +++ b/examples/multilanguage/autogenerated/basque.json @@ -0,0 +1,7 @@ +[ + [ + "eu_bdt", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/celtic.json b/examples/multilanguage/autogenerated/celtic.json new file mode 100644 index 000000000..782d2b6e2 --- /dev/null +++ b/examples/multilanguage/autogenerated/celtic.json @@ -0,0 +1,12 @@ +[ + [ + "ga_idt", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu" + ], + [ + "gd_arcosg", + "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/creole.json b/examples/multilanguage/autogenerated/creole.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/creole.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/dravidian.json b/examples/multilanguage/autogenerated/dravidian.json new file mode 100644 index 000000000..dbfb60bd8 --- /dev/null +++ b/examples/multilanguage/autogenerated/dravidian.json @@ -0,0 +1,12 @@ +[ + [ + "ta_ttb", + "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu" + ], + [ + "te_mtg", + "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/germanic.json b/examples/multilanguage/autogenerated/germanic.json new file mode 100644 index 000000000..b5f0d5f44 --- /dev/null +++ b/examples/multilanguage/autogenerated/germanic.json @@ -0,0 +1,87 @@ +[ + [ + "af_afribooms", + "corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" + ], + [ + "da_ddt", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu" + ], 
+ [ + "nl_lassysmall", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu" + ], + [ + "nl_alpino", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu" + ], + [ + "en_lines", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu" + ], + [ + "en_gum", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu" + ], + [ + "en_ewt", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu" + ], + [ + "en_partut", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu" + ], + [ + "de_gsd", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu" + ], + [ + "de_hdt", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu" + ], + [ + "got_proiel", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu" + ], + [ + "no_nynorsk", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu" + ], + [ + "no_bokmaal", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu" + ], + [ + "no_nynorsklia", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu" + ], + [ + "swl_sslc", + "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu" + ], + [ + "sv_talbanken", + "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu" + ], + [ + "sv_lines", + "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/greek.json b/examples/multilanguage/autogenerated/greek.json new file mode 100644 index 000000000..698fb30fc --- /dev/null +++ b/examples/multilanguage/autogenerated/greek.json @@ -0,0 +1,17 @@ +[ + [ + "grc_perseus", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" + ], + [ + "grc_proiel", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" + ], + [ + "el_gdt", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/indic.json b/examples/multilanguage/autogenerated/indic.json new file mode 100644 index 000000000..e1a8a23ba --- /dev/null 
+++ b/examples/multilanguage/autogenerated/indic.json @@ -0,0 +1,17 @@ +[ + [ + "hi_hdtb", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" + ], + [ + "mr_ufal", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu" + ], + [ + "ur_udtb", + "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/iranian.json b/examples/multilanguage/autogenerated/iranian.json new file mode 100644 index 000000000..9757a595d --- /dev/null +++ b/examples/multilanguage/autogenerated/iranian.json @@ -0,0 +1,12 @@ +[ + [ + "kmr_mg", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu" + ], + [ + "fa_seraji", + "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/japanese.json b/examples/multilanguage/autogenerated/japanese.json new file mode 100644 index 000000000..a07b2f897 --- /dev/null +++ b/examples/multilanguage/autogenerated/japanese.json @@ -0,0 +1,7 @@ +[ + [ + "ja_gsd", + "corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/korean.json b/examples/multilanguage/autogenerated/korean.json new file mode 100644 index 000000000..e2ada3507 --- /dev/null +++ b/examples/multilanguage/autogenerated/korean.json @@ -0,0 +1,12 @@ +[ + [ + "ko_gsd", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu" + ], + [ + "ko_kaist", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/latin.json b/examples/multilanguage/autogenerated/latin.json new file mode 100644 index 000000000..7c713f1b2 --- /dev/null +++ b/examples/multilanguage/autogenerated/latin.json @@ -0,0 +1,17 @@ +[ + [ + "la_perseus", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu" + ], + [ + "la_proiel", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu" + ], + [ + "la_ittb", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/mande.json b/examples/multilanguage/autogenerated/mande.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/mande.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/mongolic.json b/examples/multilanguage/autogenerated/mongolic.json new file mode 100644 index 000000000..eb1da4930 --- /dev/null +++ b/examples/multilanguage/autogenerated/mongolic.json @@ -0,0 +1,7 @@ +[ + [ + "bxr_bdt", + 
"corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/niger-congo.json b/examples/multilanguage/autogenerated/niger-congo.json new file mode 100644 index 000000000..5ef8afec9 --- /dev/null +++ b/examples/multilanguage/autogenerated/niger-congo.json @@ -0,0 +1,7 @@ +[ + [ + "wo_wtb", + "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/pama-nyungan.json b/examples/multilanguage/autogenerated/pama-nyungan.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/pama-nyungan.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/romance.json b/examples/multilanguage/autogenerated/romance.json new file mode 100644 index 000000000..ca3e92ea7 --- /dev/null +++ b/examples/multilanguage/autogenerated/romance.json @@ -0,0 +1,102 @@ +[ + [ + "ca_ancora", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" + ], + [ + "fr_partut", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu" + ], + [ + "fro_srcmf", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu" + ], + [ + "fr_ftb", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu" + ], + [ + "fr_spoken", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu" + ], + [ + "fr_sequoia", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu" + ], + [ + "fr_gsd", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu" + ], + [ + "gl_treegal", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu" + ], + [ + "gl_ctg", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu" + ], + [ + "it_vit", + "corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu" + ], + [ + "it_postwita", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu" + ], + [ + "it_partut", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu" + ], + [ + "it_twittiro", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu" + ], + [ + "it_isdt", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu" + ], + [ + "pt_gsd", + 
"corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu" + ], + [ + "pt_bosque", + "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu" + ], + [ + "ro_nonstandard", + "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu" + ], + [ + "ro_rrt", + "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu" + ], + [ + "es_ancora", + "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu" + ], + [ + "es_gsd", + "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/sino-tibetan.json b/examples/multilanguage/autogenerated/sino-tibetan.json new file mode 100644 index 000000000..dcfcce434 --- /dev/null +++ b/examples/multilanguage/autogenerated/sino-tibetan.json @@ -0,0 +1,17 @@ +[ + [ + "zh_gsd", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu" + ], + [ + "zh_gsdsimp", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu" + ], + [ + "lzh_kyoto", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/slavic.json b/examples/multilanguage/autogenerated/slavic.json new file mode 100644 index 000000000..9174f7dad --- /dev/null +++ b/examples/multilanguage/autogenerated/slavic.json @@ -0,0 +1,102 @@ +[ + [ + "be_hse", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu" + ], + [ + "bg_btb", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" + ], + [ + "hr_set", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu" + ], + [ + "cs_pdt", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu" + ], + [ + "cs_cltt", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu" + ], + [ + "cs_fictree", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu" + ], + [ + "cs_cac", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu" + ], + [ + "cu_proiel", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu" + ], + [ + "orv_torot", + "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu" + ], + 
[ + "pl_lfg", + "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu" + ], + [ + "pl_pdb", + "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu" + ], + [ + "ru_gsd", + "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu" + ], + [ + "ru_syntagrus", + "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu" + ], + [ + "ru_taiga", + "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu" + ], + [ + "sr_set", + "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu" + ], + [ + "sk_snk", + "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu" + ], + [ + "sl_ssj", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu" + ], + [ + "sl_sst", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu" + ], + [ + "uk_iu", + "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu" + ], + [ + "hsb_ufal", + "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/tai-kadai.json b/examples/multilanguage/autogenerated/tai-kadai.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/tai-kadai.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/tupian.json b/examples/multilanguage/autogenerated/tupian.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/examples/multilanguage/autogenerated/tupian.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/turkic.json b/examples/multilanguage/autogenerated/turkic.json new file mode 100644 index 000000000..5553a4995 --- /dev/null +++ b/examples/multilanguage/autogenerated/turkic.json @@ -0,0 +1,17 @@ +[ + [ + "kk_ktb", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu" + ], + [ + "tr_imst", + "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu" + ], + [ + "ug_udt", + "corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/autogenerated/uralic.json b/examples/multilanguage/autogenerated/uralic.json new file mode 100644 index 000000000..e5c3b8e2d --- /dev/null +++ b/examples/multilanguage/autogenerated/uralic.json @@ -0,0 +1,37 @@ +[ + [ + "et_edt", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu" + ], + [ + "et_ewt", + 
"corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu" + ], + [ + "fi_ftb", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu" + ], + [ + "fi_tdt", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu" + ], + [ + "hu_szeged", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu" + ], + [ + "olo_kkpp", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu" + ], + [ + "sme_giella", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/afrikaans.json b/examples/multilanguage/languages/afrikaans.json new file mode 100644 index 000000000..d13dabf10 --- /dev/null +++ b/examples/multilanguage/languages/afrikaans.json @@ -0,0 +1,7 @@ +[ + [ + "af_afribooms", + "corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/ancient_greek.json b/examples/multilanguage/languages/ancient_greek.json new file mode 100644 index 000000000..365ab5fd9 --- /dev/null +++ b/examples/multilanguage/languages/ancient_greek.json @@ -0,0 +1,12 @@ +[ + [ + "grc_perseus", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" + ], + [ + "grc_proiel", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/arabic.json b/examples/multilanguage/languages/arabic.json new file mode 100644 index 000000000..02e4dde85 --- /dev/null +++ b/examples/multilanguage/languages/arabic.json @@ -0,0 +1,7 @@ +[ + [ + "ar_padt", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/armenian.json b/examples/multilanguage/languages/armenian.json new file mode 100644 index 000000000..0a960ec48 --- /dev/null +++ b/examples/multilanguage/languages/armenian.json @@ -0,0 +1,7 @@ +[ + [ + "hy_armtdp", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/basque.json b/examples/multilanguage/languages/basque.json new file mode 100644 index 000000000..3ec1351a4 --- /dev/null +++ b/examples/multilanguage/languages/basque.json @@ -0,0 +1,7 @@ +[ + [ + "eu_bdt", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/belarusian.json b/examples/multilanguage/languages/belarusian.json new file mode 100644 index 
000000000..1df3802e2 --- /dev/null +++ b/examples/multilanguage/languages/belarusian.json @@ -0,0 +1,7 @@ +[ + [ + "be_hse", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/bulgarian.json b/examples/multilanguage/languages/bulgarian.json new file mode 100644 index 000000000..71fc0d3f1 --- /dev/null +++ b/examples/multilanguage/languages/bulgarian.json @@ -0,0 +1,7 @@ +[ + [ + "bg_btb", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/buryat.json b/examples/multilanguage/languages/buryat.json new file mode 100644 index 000000000..eb1da4930 --- /dev/null +++ b/examples/multilanguage/languages/buryat.json @@ -0,0 +1,7 @@ +[ + [ + "bxr_bdt", + "corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/catalan.json b/examples/multilanguage/languages/catalan.json new file mode 100644 index 000000000..9cb5e4b18 --- /dev/null +++ b/examples/multilanguage/languages/catalan.json @@ -0,0 +1,7 @@ +[ + [ + "ca_ancora", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/chinese.json b/examples/multilanguage/languages/chinese.json new file mode 100644 index 000000000..14290e8e1 --- /dev/null +++ b/examples/multilanguage/languages/chinese.json @@ -0,0 +1,12 @@ +[ + [ + "zh_gsd", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu" + ], + [ + "zh_gsdsimp", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/classical_chinese.json b/examples/multilanguage/languages/classical_chinese.json new file mode 100644 index 000000000..f4250445e --- /dev/null +++ b/examples/multilanguage/languages/classical_chinese.json @@ -0,0 +1,7 @@ +[ + [ + "lzh_kyoto", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/coptic.json b/examples/multilanguage/languages/coptic.json new file mode 100644 index 000000000..8d6b9c883 --- /dev/null +++ b/examples/multilanguage/languages/coptic.json @@ -0,0 +1,7 @@ +[ + [ + "cop_scriptorium", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/croatian.json b/examples/multilanguage/languages/croatian.json new file mode 100644 index 000000000..1e9d540c6 --- /dev/null +++ b/examples/multilanguage/languages/croatian.json @@ -0,0 +1,7 @@ +[ + [ + "hr_set", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu" + ] +] \ No 
newline at end of file diff --git a/examples/multilanguage/languages/czech.json b/examples/multilanguage/languages/czech.json new file mode 100644 index 000000000..d1a4295f8 --- /dev/null +++ b/examples/multilanguage/languages/czech.json @@ -0,0 +1,22 @@ +[ + [ + "cs_cac", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu" + ], + [ + "cs_cltt", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu" + ], + [ + "cs_fictree", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu" + ], + [ + "cs_pdt", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/danish.json b/examples/multilanguage/languages/danish.json new file mode 100644 index 000000000..e71b46dd2 --- /dev/null +++ b/examples/multilanguage/languages/danish.json @@ -0,0 +1,7 @@ +[ + [ + "da_ddt", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/dutch.json b/examples/multilanguage/languages/dutch.json new file mode 100644 index 000000000..26ce45d17 --- /dev/null +++ b/examples/multilanguage/languages/dutch.json @@ -0,0 +1,12 @@ +[ + [ + "nl_alpino", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu" + ], + [ + "nl_lassysmall", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/english.json b/examples/multilanguage/languages/english.json new file mode 100644 index 000000000..798b4b09a --- /dev/null +++ b/examples/multilanguage/languages/english.json @@ -0,0 +1,22 @@ +[ + [ + "en_ewt", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu" + ], + [ + "en_gum", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu" + ], + [ + "en_lines", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu" + ], + [ + "en_partut", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/estonian.json b/examples/multilanguage/languages/estonian.json new file mode 100644 index 000000000..61c8a7fad --- /dev/null +++ b/examples/multilanguage/languages/estonian.json @@ -0,0 +1,12 @@ +[ + [ + "et_edt", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu" + ], + [ + "et_ewt", + "corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/finnish.json b/examples/multilanguage/languages/finnish.json 
new file mode 100644 index 000000000..2fa56e182 --- /dev/null +++ b/examples/multilanguage/languages/finnish.json @@ -0,0 +1,12 @@ +[ + [ + "fi_ftb", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu" + ], + [ + "fi_tdt", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/french.json b/examples/multilanguage/languages/french.json new file mode 100644 index 000000000..2dded7ffe --- /dev/null +++ b/examples/multilanguage/languages/french.json @@ -0,0 +1,27 @@ +[ + [ + "fr_ftb", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu" + ], + [ + "fr_gsd", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu" + ], + [ + "fr_partut", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu" + ], + [ + "fr_sequoia", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu" + ], + [ + "fr_spoken", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/galician.json b/examples/multilanguage/languages/galician.json new file mode 100644 index 000000000..5ab169be0 --- /dev/null +++ b/examples/multilanguage/languages/galician.json @@ -0,0 +1,12 @@ +[ + [ + "gl_ctg", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu" + ], + [ + "gl_treegal", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/german.json b/examples/multilanguage/languages/german.json new file mode 100644 index 000000000..ec460c9c1 --- /dev/null +++ b/examples/multilanguage/languages/german.json @@ -0,0 +1,12 @@ +[ + [ + "de_gsd", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu" + ], + [ + "de_hdt", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/gothic.json b/examples/multilanguage/languages/gothic.json new file mode 100644 index 000000000..c513c273d --- /dev/null +++ b/examples/multilanguage/languages/gothic.json @@ -0,0 +1,7 @@ +[ + [ + "got_proiel", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/greek.json b/examples/multilanguage/languages/greek.json new file mode 100644 index 000000000..76085a5cc --- /dev/null +++ b/examples/multilanguage/languages/greek.json @@ -0,0 +1,7 @@ +[ + [ + "el_gdt", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu" + ] +] \ 
No newline at end of file diff --git a/examples/multilanguage/languages/hebrew.json b/examples/multilanguage/languages/hebrew.json new file mode 100644 index 000000000..edd360c9b --- /dev/null +++ b/examples/multilanguage/languages/hebrew.json @@ -0,0 +1,7 @@ +[ + [ + "he_htb", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/hindi.json b/examples/multilanguage/languages/hindi.json new file mode 100644 index 000000000..b250ae21b --- /dev/null +++ b/examples/multilanguage/languages/hindi.json @@ -0,0 +1,7 @@ +[ + [ + "hi_hdtb", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/hungarian.json b/examples/multilanguage/languages/hungarian.json new file mode 100644 index 000000000..d69bd4d70 --- /dev/null +++ b/examples/multilanguage/languages/hungarian.json @@ -0,0 +1,7 @@ +[ + [ + "hu_szeged", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/indonesian.json b/examples/multilanguage/languages/indonesian.json new file mode 100644 index 000000000..280839fc6 --- /dev/null +++ b/examples/multilanguage/languages/indonesian.json @@ -0,0 +1,7 @@ +[ + [ + "id_gsd", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/irish.json b/examples/multilanguage/languages/irish.json new file mode 100644 index 000000000..c562fb030 --- /dev/null +++ b/examples/multilanguage/languages/irish.json @@ -0,0 +1,7 @@ +[ + [ + "ga_idt", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/italian.json b/examples/multilanguage/languages/italian.json new file mode 100644 index 000000000..5a02de18d --- /dev/null +++ b/examples/multilanguage/languages/italian.json @@ -0,0 +1,27 @@ +[ + [ + "it_isdt", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu" + ], + [ + "it_partut", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu" + ], + [ + "it_postwita", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu" + ], + [ + "it_twittiro", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu" + ], + [ + "it_vit", + "corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/japanese.json b/examples/multilanguage/languages/japanese.json new file mode 100644 index 000000000..a07b2f897 --- /dev/null +++ b/examples/multilanguage/languages/japanese.json @@ -0,0 +1,7 @@ +[ + [ + "ja_gsd", + 
"corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/kazakh.json b/examples/multilanguage/languages/kazakh.json new file mode 100644 index 000000000..b4f91afd8 --- /dev/null +++ b/examples/multilanguage/languages/kazakh.json @@ -0,0 +1,7 @@ +[ + [ + "kk_ktb", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/korean.json b/examples/multilanguage/languages/korean.json new file mode 100644 index 000000000..e2ada3507 --- /dev/null +++ b/examples/multilanguage/languages/korean.json @@ -0,0 +1,12 @@ +[ + [ + "ko_gsd", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu" + ], + [ + "ko_kaist", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/kurmanji.json b/examples/multilanguage/languages/kurmanji.json new file mode 100644 index 000000000..8fb38cb03 --- /dev/null +++ b/examples/multilanguage/languages/kurmanji.json @@ -0,0 +1,7 @@ +[ + [ + "kmr_mg", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/latin.json b/examples/multilanguage/languages/latin.json new file mode 100644 index 000000000..d882860fd --- /dev/null +++ b/examples/multilanguage/languages/latin.json @@ -0,0 +1,17 @@ +[ + [ + "la_ittb", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu" + ], + [ + "la_perseus", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu" + ], + [ + "la_proiel", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/latvian.json b/examples/multilanguage/languages/latvian.json new file mode 100644 index 000000000..557a24059 --- /dev/null +++ b/examples/multilanguage/languages/latvian.json @@ -0,0 +1,7 @@ +[ + [ + "lv_lvtb", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/lithuanian.json b/examples/multilanguage/languages/lithuanian.json new file mode 100644 index 000000000..0362d91d5 --- /dev/null +++ b/examples/multilanguage/languages/lithuanian.json @@ -0,0 +1,12 @@ +[ + [ + "lt_alksnis", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu" + ], + [ + "lt_hse", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/livvi.json b/examples/multilanguage/languages/livvi.json new file mode 100644 index 000000000..3c0566947 
--- /dev/null +++ b/examples/multilanguage/languages/livvi.json @@ -0,0 +1,7 @@ +[ + [ + "olo_kkpp", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/maltese.json b/examples/multilanguage/languages/maltese.json new file mode 100644 index 000000000..943d1ab96 --- /dev/null +++ b/examples/multilanguage/languages/maltese.json @@ -0,0 +1,7 @@ +[ + [ + "mt_mudt", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/marathi.json b/examples/multilanguage/languages/marathi.json new file mode 100644 index 000000000..5542a9cd0 --- /dev/null +++ b/examples/multilanguage/languages/marathi.json @@ -0,0 +1,7 @@ +[ + [ + "mr_ufal", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/north_sami.json b/examples/multilanguage/languages/north_sami.json new file mode 100644 index 000000000..e9473e648 --- /dev/null +++ b/examples/multilanguage/languages/north_sami.json @@ -0,0 +1,7 @@ +[ + [ + "sme_giella", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/norwegian.json b/examples/multilanguage/languages/norwegian.json new file mode 100644 index 000000000..58d035bf7 --- /dev/null +++ b/examples/multilanguage/languages/norwegian.json @@ -0,0 +1,17 @@ +[ + [ + "no_bokmaal", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu" + ], + [ + "no_nynorsk", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu" + ], + [ + "no_nynorsklia", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/old_church_slavonic.json b/examples/multilanguage/languages/old_church_slavonic.json new file mode 100644 index 000000000..6735ed985 --- /dev/null +++ b/examples/multilanguage/languages/old_church_slavonic.json @@ -0,0 +1,7 @@ +[ + [ + "cu_proiel", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/old_french.json b/examples/multilanguage/languages/old_french.json new file mode 100644 index 000000000..8de50a8ec --- /dev/null +++ b/examples/multilanguage/languages/old_french.json @@ -0,0 +1,7 @@ +[ + [ + "fro_srcmf", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu", + "corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu" + ] +] \ No newline at end of file diff --git a/examples/multilanguage/languages/old_russian.json b/examples/multilanguage/languages/old_russian.json new file mode 100644 index 000000000..9dca0a859 --- /dev/null +++ 
b/examples/multilanguage/languages/old_russian.json
@@ -0,0 +1,7 @@
+[
+  [
+    "orv_torot",
+    "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/persian.json b/examples/multilanguage/languages/persian.json
new file mode 100644
index 000000000..c3f21c7ae
--- /dev/null
+++ b/examples/multilanguage/languages/persian.json
@@ -0,0 +1,7 @@
+[
+  [
+    "fa_seraji",
+    "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/polish.json b/examples/multilanguage/languages/polish.json
new file mode 100644
index 000000000..3d3acad94
--- /dev/null
+++ b/examples/multilanguage/languages/polish.json
@@ -0,0 +1,12 @@
+[
+  [
+    "pl_lfg",
+    "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu"
+  ],
+  [
+    "pl_pdb",
+    "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/portuguese.json b/examples/multilanguage/languages/portuguese.json
new file mode 100644
index 000000000..be967dbc5
--- /dev/null
+++ b/examples/multilanguage/languages/portuguese.json
@@ -0,0 +1,12 @@
+[
+  [
+    "pt_bosque",
+    "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
+  ],
+  [
+    "pt_gsd",
+    "corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/romanian.json b/examples/multilanguage/languages/romanian.json
new file mode 100644
index 000000000..08d33c146
--- /dev/null
+++ b/examples/multilanguage/languages/romanian.json
@@ -0,0 +1,12 @@
+[
+  [
+    "ro_nonstandard",
+    "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
+  ],
+  [
+    "ro_rrt",
+    "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/russian.json b/examples/multilanguage/languages/russian.json
new file mode 100644
index 000000000..c49bf8b5d
--- /dev/null
+++ b/examples/multilanguage/languages/russian.json
@@ -0,0 +1,17 @@
+[
+  [
+    "ru_gsd",
+    "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu"
+  ],
+  [
+    "ru_syntagrus",
+    "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
+  ],
+  [
+    "ru_taiga",
+    "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/scottish_gaelic.json b/examples/multilanguage/languages/scottish_gaelic.json
new file mode 100644
index 000000000..2afe0e798
--- /dev/null
+++ b/examples/multilanguage/languages/scottish_gaelic.json
@@ -0,0 +1,7 @@
+[
+  [
+    "gd_arcosg",
+    "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/serbian.json b/examples/multilanguage/languages/serbian.json
new file mode 100644
index 000000000..7048212a7
--- /dev/null
+++ b/examples/multilanguage/languages/serbian.json
@@ -0,0 +1,7 @@
+[
+  [
+    "sr_set",
+    "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/slovak.json b/examples/multilanguage/languages/slovak.json
new file mode 100644
index 000000000..bf31a5152
--- /dev/null
+++ b/examples/multilanguage/languages/slovak.json
@@ -0,0 +1,7 @@
+[
+  [
+    "sk_snk",
+    "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/slovenian.json b/examples/multilanguage/languages/slovenian.json
new file mode 100644
index 000000000..99b560ea2
--- /dev/null
+++ b/examples/multilanguage/languages/slovenian.json
@@ -0,0 +1,12 @@
+[
+  [
+    "sl_ssj",
+    "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
+  ],
+  [
+    "sl_sst",
+    "corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/spanish.json b/examples/multilanguage/languages/spanish.json
new file mode 100644
index 000000000..5e8e3e86f
--- /dev/null
+++ b/examples/multilanguage/languages/spanish.json
@@ -0,0 +1,12 @@
+[
+  [
+    "es_ancora",
+    "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
+  ],
+  [
+    "es_gsd",
+    "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/swedish.json b/examples/multilanguage/languages/swedish.json
new file mode 100644
index 000000000..7440713f0
--- /dev/null
+++ b/examples/multilanguage/languages/swedish.json
@@ -0,0 +1,12 @@
+[
+  [
+    "sv_lines",
+    "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu"
+  ],
+  [
+    "sv_talbanken",
+    "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/swedish_sign_language.json b/examples/multilanguage/languages/swedish_sign_language.json
new file mode 100644
index 000000000..a149170d3
--- /dev/null
+++ b/examples/multilanguage/languages/swedish_sign_language.json
@@ -0,0 +1,7 @@
+[
+  [
+    "swl_sslc",
+    "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/tamil.json b/examples/multilanguage/languages/tamil.json
new file mode 100644
index 000000000..bc3bc7d3a
--- /dev/null
+++ b/examples/multilanguage/languages/tamil.json
@@ -0,0 +1,7 @@
+[
+  [
+    "ta_ttb",
+    "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/telugu.json b/examples/multilanguage/languages/telugu.json
new file mode 100644
index 000000000..1dc8da368
--- /dev/null
+++ b/examples/multilanguage/languages/telugu.json
@@ -0,0 +1,7 @@
+[
+  [
+    "te_mtg",
+    "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
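Each per-language JSON file above follows the same schema: a list of [treebank_id, train_conllu, dev_conllu] triples, so a single model can be trained jointly on all treebanks of one language (polish.json, for instance, combines pl_lfg and pl_pdb). Where a treebank ships without a dev split, the dev slot points at the test file instead (see sl_sst in slovenian.json above and hsb_ufal in upper_sorbian.json below). A minimal reading sketch; load_treebanks is an illustrative helper, not part of the repository's API:

import json

def load_treebanks(path: str):
    # Each entry is [treebank_id, train_conllu, dev_conllu].
    with open(path, encoding="utf-8") as f:
        return [tuple(entry) for entry in json.load(f)]

for tb_id, train_file, dev_file in load_treebanks("examples/multilanguage/languages/polish.json"):
    print(tb_id, train_file, dev_file)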
diff --git a/examples/multilanguage/languages/train_compound.sh b/examples/multilanguage/languages/train_compound.sh
new file mode 100755
index 000000000..db7d2ef98
--- /dev/null
+++ b/examples/multilanguage/languages/train_compound.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+for lang in afrikaans ancient_greek arabic armenian basque belarusian bulgarian buryat catalan chinese \
+    classical_chinese coptic croatian czech danish dutch english estonian finnish french \
+    galician german gothic greek hebrew hindi hungarian indonesian irish italian \
+    japanese kazakh korean kurmanji latin latvian lithuanian livvi maltese marathi \
+    north_sami norwegian old_church_slavonic old_french old_russian persian polish portuguese romanian russian \
+    scottish_gaelic serbian slovak slovenian spanish swedish swedish_sign_language tamil telugu turkish \
+    ukrainian upper_sorbian urdu uyghur vietnamese wolof; do
+    python3 cube/networks/compound.py --batch-size=32 --device=cuda:0 --store=data/compound-$lang --train=examples/multilanguage/languages/$lang.json
+done
diff --git a/examples/multilanguage/languages/train_lemmatizers.sh b/examples/multilanguage/languages/train_lemmatizers.sh
new file mode 100755
index 000000000..b9dbd2174
--- /dev/null
+++ b/examples/multilanguage/languages/train_lemmatizers.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+for lang in afrikaans ancient_greek arabic armenian basque belarusian bulgarian buryat catalan chinese \
+    classical_chinese coptic croatian czech danish dutch english estonian finnish french \
+    galician german gothic greek hebrew hindi hungarian indonesian irish italian \
+    japanese kazakh korean kurmanji latin latvian lithuanian livvi maltese marathi \
+    north_sami norwegian old_church_slavonic old_french old_russian persian polish portuguese romanian russian \
+    scottish_gaelic serbian slovak slovenian spanish swedish swedish_sign_language tamil telugu turkish \
+    ukrainian upper_sorbian urdu uyghur vietnamese wolof; do
+    python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-$lang --train=examples/multilanguage/languages/$lang.json
+done
diff --git a/examples/multilanguage/languages/train_parsers.sh b/examples/multilanguage/languages/train_parsers.sh
new file mode 100755
index 000000000..6a89380e2
--- /dev/null
+++ b/examples/multilanguage/languages/train_parsers.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+for lang in afrikaans ancient_greek arabic armenian basque belarusian bulgarian buryat catalan chinese \
+    classical_chinese coptic croatian czech danish dutch english estonian finnish french \
+    galician german gothic greek hebrew hindi hungarian indonesian irish italian \
+    japanese kazakh korean kurmanji latin latvian lithuanian livvi maltese marathi \
+    north_sami norwegian old_church_slavonic old_french old_russian persian polish portuguese romanian russian \
+    scottish_gaelic serbian slovak slovenian spanish swedish swedish_sign_language tamil telugu turkish \
+    ukrainian upper_sorbian urdu uyghur vietnamese wolof; do
+    python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-$lang --train=examples/multilanguage/languages/$lang.json
+done
diff --git a/examples/multilanguage/languages/train_taggers.sh b/examples/multilanguage/languages/train_taggers.sh
new file mode 100755
index 000000000..32f397783
--- /dev/null
+++ b/examples/multilanguage/languages/train_taggers.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+for lang in afrikaans ancient_greek arabic armenian basque belarusian bulgarian buryat catalan chinese \
+    classical_chinese coptic croatian czech danish dutch english estonian finnish french \
+    galician german gothic greek hebrew hindi hungarian indonesian irish italian \
+    japanese kazakh korean kurmanji latin latvian lithuanian livvi maltese marathi \
+    north_sami norwegian old_church_slavonic old_french old_russian persian polish portuguese romanian russian \
+    scottish_gaelic serbian slovak slovenian spanish swedish swedish_sign_language tamil telugu turkish \
+    ukrainian upper_sorbian urdu uyghur vietnamese wolof; do
+    python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-$lang --train=examples/multilanguage/languages/$lang.json
+done
diff --git a/examples/multilanguage/languages/train_tokenizers.sh b/examples/multilanguage/languages/train_tokenizers.sh
new file mode 100755
index 000000000..ac68f36a5
--- /dev/null
+++ b/examples/multilanguage/languages/train_tokenizers.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+for lang in afrikaans ancient_greek arabic armenian basque belarusian bulgarian buryat catalan chinese \
+    classical_chinese coptic croatian czech danish dutch english estonian finnish french \
+    galician german gothic greek hebrew hindi hungarian indonesian irish italian \
+    japanese kazakh korean kurmanji latin latvian lithuanian livvi maltese marathi \
+    north_sami norwegian old_church_slavonic old_french old_russian persian polish portuguese romanian russian \
+    scottish_gaelic serbian slovak slovenian spanish swedish swedish_sign_language tamil telugu turkish \
+    ukrainian upper_sorbian urdu uyghur vietnamese wolof; do
+    python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-$lang --train=examples/multilanguage/languages/$lang.json
+done
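The five languages/train_*.sh scripts above are identical apart from the entry point and batch size (tokenizer.py at 16, tagger.py at 8, parser.py at 4, lemmatizer.py and compound.py at 32); run together for one language they train a complete pipeline. Below is a hedged sketch that drives all five stages from Python instead of bash; the command lines mirror the scripts, but the wrapper itself (train_language, COMPONENTS) is illustrative:

import subprocess

# (component, batch size) pairs, mirroring the five scripts above
COMPONENTS = [("tokenizer", 16), ("tagger", 8), ("parser", 4),
              ("lemmatizer", 32), ("compound", 32)]

def train_language(lang: str, device: str = "cuda:0") -> None:
    # Runs each trainer exactly as the shell scripts do, one component at a time.
    for component, batch_size in COMPONENTS:
        subprocess.run(["python3", f"cube/networks/{component}.py",
                        f"--batch-size={batch_size}", f"--device={device}",
                        f"--store=data/{component}-{lang}",
                        f"--train=examples/multilanguage/languages/{lang}.json"],
                       check=True)

train_language("wolof")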
diff --git a/examples/multilanguage/languages/turkish.json b/examples/multilanguage/languages/turkish.json
new file mode 100644
index 000000000..6d341381f
--- /dev/null
+++ b/examples/multilanguage/languages/turkish.json
@@ -0,0 +1,7 @@
+[
+  [
+    "tr_imst",
+    "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/ukrainian.json b/examples/multilanguage/languages/ukrainian.json
new file mode 100644
index 000000000..7de1d2a09
--- /dev/null
+++ b/examples/multilanguage/languages/ukrainian.json
@@ -0,0 +1,7 @@
+[
+  [
+    "uk_iu",
+    "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/upper_sorbian.json b/examples/multilanguage/languages/upper_sorbian.json
new file mode 100644
index 000000000..c02e144b9
--- /dev/null
+++ b/examples/multilanguage/languages/upper_sorbian.json
@@ -0,0 +1,7 @@
+[
+  [
+    "hsb_ufal",
+    "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/urdu.json b/examples/multilanguage/languages/urdu.json
new file mode 100644
index 000000000..453e3cf38
--- /dev/null
+++ b/examples/multilanguage/languages/urdu.json
@@ -0,0 +1,7 @@
+[
+  [
+    "ur_udtb",
+    "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/uyghur.json b/examples/multilanguage/languages/uyghur.json
new file mode 100644
index 000000000..db05c4c7f
--- /dev/null
+++ b/examples/multilanguage/languages/uyghur.json
@@ -0,0 +1,7 @@
+[
+  [
+    "ug_udt",
+    "corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/vietnamese.json b/examples/multilanguage/languages/vietnamese.json
new file mode 100644
index 000000000..abef5dc65
--- /dev/null
+++ b/examples/multilanguage/languages/vietnamese.json
@@ -0,0 +1,7 @@
+[
+  [
+    "vi_vtb",
+    "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
diff --git a/examples/multilanguage/languages/wolof.json b/examples/multilanguage/languages/wolof.json
new file mode 100644
index 000000000..5ef8afec9
--- /dev/null
+++ b/examples/multilanguage/languages/wolof.json
@@ -0,0 +1,7 @@
+[
+  [
+    "wo_wtb",
+    "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu",
+    "corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
+  ]
+]
\ No newline at end of file
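The examples/multilanguage/train_*.sh scripts that follow read examples/multilanguage/autogenerated/*.json, which group treebanks by language family (afro-asiatic, germanic, slavic and so on, 28 in total) rather than by single language; the compound trainer's batch size also rises from 32 to 64 for these larger mixed corpora. The autogenerated files are not part of this diff. Assuming they reuse the per-language triple format, a family file could be built by concatenating per-language lists, roughly as in this sketch (the FAMILIES mapping is a truncated illustration, not the actual grouping):

import json

# Illustrative subset only; the actual grouping spans 28 families.
FAMILIES = {
    "baltic": ["latvian", "lithuanian"],
    "celtic": ["irish", "scottish_gaelic"],
}

for family, langs in FAMILIES.items():
    merged = []
    for lang in langs:
        # Per-language files hold [treebank_id, train, dev] triples (see above).
        with open(f"examples/multilanguage/languages/{lang}.json", encoding="utf-8") as f:
            merged.extend(json.load(f))
    with open(f"examples/multilanguage/autogenerated/{family}.json", "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2)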
--device=cuda:0 --store=data/compound-celtic --train=examples/multilanguage/autogenerated/celtic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-creole --train=examples/multilanguage/autogenerated/creole.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-dravidian --train=examples/multilanguage/autogenerated/dravidian.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-germanic --train=examples/multilanguage/autogenerated/germanic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-greek --train=examples/multilanguage/autogenerated/greek.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-indic --train=examples/multilanguage/autogenerated/indic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-iranian --train=examples/multilanguage/autogenerated/iranian.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-japanese --train=examples/multilanguage/autogenerated/japanese.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-korean --train=examples/multilanguage/autogenerated/korean.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-latin --train=examples/multilanguage/autogenerated/latin.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-mande --train=examples/multilanguage/autogenerated/mande.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-mongolic --train=examples/multilanguage/autogenerated/mongolic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-niger-congo --train=examples/multilanguage/autogenerated/niger-congo.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-pama-nyungan --train=examples/multilanguage/autogenerated/pama-nyungan.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-romance --train=examples/multilanguage/autogenerated/romance.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-sino-tibetan --train=examples/multilanguage/autogenerated/sino-tibetan.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-slavic --train=examples/multilanguage/autogenerated/slavic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-tai-kadai --train=examples/multilanguage/autogenerated/tai-kadai.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-tupian --train=examples/multilanguage/autogenerated/tupian.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-turkic --train=examples/multilanguage/autogenerated/turkic.json +python3 cube/networks/compound.py --batch-size=64 --device=cuda:0 --store=data/compound-uralic --train=examples/multilanguage/autogenerated/uralic.json diff --git a/examples/multilanguage/train_lemmatizers.sh b/examples/multilanguage/train_lemmatizers.sh new file mode 100755 index 000000000..dd919d523 --- /dev/null +++ b/examples/multilanguage/train_lemmatizers.sh @@ -0,0 +1,29 @@ +#!/bin/bash +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 
--store=data/lemmatizer-afro-asiatic --train=examples/multilanguage/autogenerated/afro-asiatic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-albanian --train=examples/multilanguage/autogenerated/albanian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-armenian --train=examples/multilanguage/autogenerated/armenian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-austro-asiatic --train=examples/multilanguage/autogenerated/austro-asiatic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-austronesian --train=examples/multilanguage/autogenerated/austronesian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-baltic --train=examples/multilanguage/autogenerated/baltic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-basque --train=examples/multilanguage/autogenerated/basque.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-celtic --train=examples/multilanguage/autogenerated/celtic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-creole --train=examples/multilanguage/autogenerated/creole.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-dravidian --train=examples/multilanguage/autogenerated/dravidian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-germanic --train=examples/multilanguage/autogenerated/germanic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-greek --train=examples/multilanguage/autogenerated/greek.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-indic --train=examples/multilanguage/autogenerated/indic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-iranian --train=examples/multilanguage/autogenerated/iranian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-japanese --train=examples/multilanguage/autogenerated/japanese.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-korean --train=examples/multilanguage/autogenerated/korean.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-latin --train=examples/multilanguage/autogenerated/latin.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-mande --train=examples/multilanguage/autogenerated/mande.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-mongolic --train=examples/multilanguage/autogenerated/mongolic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-niger-congo --train=examples/multilanguage/autogenerated/niger-congo.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-pama-nyungan --train=examples/multilanguage/autogenerated/pama-nyungan.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-romance --train=examples/multilanguage/autogenerated/romance.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 
--store=data/lemmatizer-sino-tibetan --train=examples/multilanguage/autogenerated/sino-tibetan.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-slavic --train=examples/multilanguage/autogenerated/slavic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-tai-kadai --train=examples/multilanguage/autogenerated/tai-kadai.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-tupian --train=examples/multilanguage/autogenerated/tupian.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-turkic --train=examples/multilanguage/autogenerated/turkic.json +python3 cube/networks/lemmatizer.py --batch-size=32 --device=cuda:0 --store=data/lemmatizer-uralic --train=examples/multilanguage/autogenerated/uralic.json diff --git a/examples/multilanguage/train_parsers.sh b/examples/multilanguage/train_parsers.sh new file mode 100755 index 000000000..0741a7a38 --- /dev/null +++ b/examples/multilanguage/train_parsers.sh @@ -0,0 +1,29 @@ +#!/bin/bash +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-afro-asiatic --train=examples/multilanguage/autogenerated/afro-asiatic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-albanian --train=examples/multilanguage/autogenerated/albanian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-armenian --train=examples/multilanguage/autogenerated/armenian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-austro-asiatic --train=examples/multilanguage/autogenerated/austro-asiatic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-austronesian --train=examples/multilanguage/autogenerated/austronesian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-baltic --train=examples/multilanguage/autogenerated/baltic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-basque --train=examples/multilanguage/autogenerated/basque.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-celtic --train=examples/multilanguage/autogenerated/celtic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-creole --train=examples/multilanguage/autogenerated/creole.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-dravidian --train=examples/multilanguage/autogenerated/dravidian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-germanic --train=examples/multilanguage/autogenerated/germanic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-greek --train=examples/multilanguage/autogenerated/greek.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-indic --train=examples/multilanguage/autogenerated/indic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-iranian --train=examples/multilanguage/autogenerated/iranian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-japanese --train=examples/multilanguage/autogenerated/japanese.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-korean --train=examples/multilanguage/autogenerated/korean.json 
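+# The per-family invocations below repeat one command with only the family name
+# substituted; assuming the autogenerated JSON files keep the naming used above,
+# an equivalent loop sketch would be:
+#   for fam in latin mande mongolic niger-congo pama-nyungan romance \
+#       sino-tibetan slavic tai-kadai tupian turkic uralic; do
+#     python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 \
+#       --store=data/parser-$fam --train=examples/multilanguage/autogenerated/$fam.json
+#   done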
+python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-latin --train=examples/multilanguage/autogenerated/latin.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-mande --train=examples/multilanguage/autogenerated/mande.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-mongolic --train=examples/multilanguage/autogenerated/mongolic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-niger-congo --train=examples/multilanguage/autogenerated/niger-congo.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-pama-nyungan --train=examples/multilanguage/autogenerated/pama-nyungan.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-romance --train=examples/multilanguage/autogenerated/romance.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-sino-tibetan --train=examples/multilanguage/autogenerated/sino-tibetan.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-slavic --train=examples/multilanguage/autogenerated/slavic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-tai-kadai --train=examples/multilanguage/autogenerated/tai-kadai.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-tupian --train=examples/multilanguage/autogenerated/tupian.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-turkic --train=examples/multilanguage/autogenerated/turkic.json +python3 cube/networks/parser.py --batch-size=4 --device=cuda:0 --store=data/parser-uralic --train=examples/multilanguage/autogenerated/uralic.json \ No newline at end of file diff --git a/examples/multilanguage/train_taggers.sh b/examples/multilanguage/train_taggers.sh new file mode 100755 index 000000000..f9922fe99 --- /dev/null +++ b/examples/multilanguage/train_taggers.sh @@ -0,0 +1,29 @@ +#!/bin/bash +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-afro-asiatic --train=examples/multilanguage/autogenerated/afro-asiatic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-albanian --train=examples/multilanguage/autogenerated/albanian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-armenian --train=examples/multilanguage/autogenerated/armenian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-austro-asiatic --train=examples/multilanguage/autogenerated/austro-asiatic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-austronesian --train=examples/multilanguage/autogenerated/austronesian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-baltic --train=examples/multilanguage/autogenerated/baltic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-basque --train=examples/multilanguage/autogenerated/basque.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-celtic --train=examples/multilanguage/autogenerated/celtic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-creole --train=examples/multilanguage/autogenerated/creole.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-dravidian
--train=examples/multilanguage/autogenerated/dravidian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-germanic --train=examples/multilanguage/autogenerated/germanic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-greek --train=examples/multilanguage/autogenerated/greek.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-indic --train=examples/multilanguage/autogenerated/indic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-iranian --train=examples/multilanguage/autogenerated/iranian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-japanese --train=examples/multilanguage/autogenerated/japanese.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-korean --train=examples/multilanguage/autogenerated/korean.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-latin --train=examples/multilanguage/autogenerated/latin.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-mande --train=examples/multilanguage/autogenerated/mande.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-mongolic --train=examples/multilanguage/autogenerated/mongolic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-niger-congo --train=examples/multilanguage/autogenerated/niger-congo.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-pama-nyungan --train=examples/multilanguage/autogenerated/pama-nyungan.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-romance --train=examples/multilanguage/autogenerated/romance.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-sino-tibetan --train=examples/multilanguage/autogenerated/sino-tibetan.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-slavic --train=examples/multilanguage/autogenerated/slavic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-tai-kadai --train=examples/multilanguage/autogenerated/tai-kadai.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-tupian --train=examples/multilanguage/autogenerated/tupian.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-turkic --train=examples/multilanguage/autogenerated/turkic.json +python3 cube/networks/tagger.py --batch-size=8 --device=cuda:0 --store=data/tagger-uralic --train=examples/multilanguage/autogenerated/uralic.json \ No newline at end of file diff --git a/examples/multilanguage/train_tokenizers.sh b/examples/multilanguage/train_tokenizers.sh new file mode 100755 index 000000000..2332e16f8 --- /dev/null +++ b/examples/multilanguage/train_tokenizers.sh @@ -0,0 +1,30 @@ +#!/bin/bash +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-afro-asiatic --train=examples/multilanguage/autogenerated/afro-asiatic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-albanian --train=examples/multilanguage/autogenerated/albanian.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-armenian --train=examples/multilanguage/autogenerated/armenian.json +python3 
cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-austro-asiatic --train=examples/multilanguage/autogenerated/austro-asiatic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-austronesian --train=examples/multilanguage/autogenerated/austronesian.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-baltic --train=examples/multilanguage/autogenerated/baltic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-basque --train=examples/multilanguage/autogenerated/basque.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-celtic --train=examples/multilanguage/autogenerated/celtic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-creole --train=examples/multilanguage/autogenerated/creole.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-dravidian --train=examples/multilanguage/autogenerated/dravidian.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-germanic --train=examples/multilanguage/autogenerated/germanic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-greek --train=examples/multilanguage/autogenerated/greek.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-indic --train=examples/multilanguage/autogenerated/indic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-iranian --train=examples/multilanguage/autogenerated/iranian.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-japanese --train=examples/multilanguage/autogenerated/japanese.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-korean --train=examples/multilanguage/autogenerated/korean.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-latin --train=examples/multilanguage/autogenerated/latin.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-mande --train=examples/multilanguage/autogenerated/mande.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-mongolic --train=examples/multilanguage/autogenerated/mongolic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-niger-congo --train=examples/multilanguage/autogenerated/niger-congo.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-pama-nyungan --train=examples/multilanguage/autogenerated/pama-nyungan.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-romance --train=examples/multilanguage/autogenerated/romance.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-sino-tibetan --train=examples/multilanguage/autogenerated/sino-tibetan.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-slavic --train=examples/multilanguage/autogenerated/slavic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-tai-kadai --train=examples/multilanguage/autogenerated/tai-kadai.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 
--store=data/tokenizer-tupian --train=examples/multilanguage/autogenerated/tupian.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-turkic --train=examples/multilanguage/autogenerated/turkic.json +python3 cube/networks/tokenizer.py --batch-size=16 --device=cuda:0 --store=data/tokenizer-uralic --train=examples/multilanguage/autogenerated/uralic.json + diff --git a/requirements.txt b/requirements.txt index b3870937e..bf5daffe6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ future>=0.16.0 scipy>=1.0.0 nltk>=3.2.5 requests>=2.18.4 -dyNET flask beautifulsoup4 +numpy # Parse XML xmltodict==0.11.0 @@ -13,3 +13,11 @@ xmltodict==0.11.0 # Testing nose2==0.7.3 regex +torch +tqdm +configparser +pytorch_lightning==1.2.10 + +transformers==4.2.2 +sentencepiece +fasttext==0.9.2 diff --git a/scripts/_del_pack_models_for_upload.py b/scripts/_del_pack_models_for_upload.py new file mode 100644 index 000000000..cebf90749 --- /dev/null +++ b/scripts/_del_pack_models_for_upload.py @@ -0,0 +1,138 @@ +""" this needs to be deleted """ + +import json, os, sys +# Append parent dir to sys path. +from shutil import rmtree, copyfile +import logging +from tqdm.autonotebook import tqdm as tqdm + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +os.sys.path.insert(0, parent_dir) + +from cube.io_utils.modelstore import ModelStore + +def check (filename, language_family, files): + for file in files: + if filename in file: + return 1 + print(f"{language_family} is missing [{filename}].\n") + with open("log.txt", "a") as f: + f.write(f"{language_family} is missing [{filename}].\n") + return 0 + +if __name__ == "__main__": + refresh = False + folder_with_base_confs = os.path.abspath("scripts//train//2.7//language//") + folder_with_all_trained_models = "/media/echo/5CA436CBA436A802/work/models" #os.path.abspath("models") + folder_where_to_output_everything = "/media/echo/5CA436CBA436A802/work/nlp-cube-models" #os.path.abspath("nlp-cube-models") + url_root_for_models = "https://github.com/adobe/NLP-Cube-Models/blob/3.0/models/" # !! 
make sure it ends with a / + + logger = logging.getLogger("cube") + log_handler = logging.StreamHandler() + log_formatter = logging.Formatter( + fmt="[%(levelname)8s | %(asctime)s | %(filename)-20s:%(lineno)3s | %(funcName)-26s] %(message)s", + datefmt='%Y-%m-%d %H:%M:%S') + log_handler.setFormatter(log_formatter) + logger.addHandler(log_handler) + logger.setLevel(logging.DEBUG) + + + """ + Read base confs and get a list of language families + """ + language_family_confs = os.listdir(folder_with_base_confs) # this just lists the files without path + language_family_confs = [os.path.abspath(os.path.join(folder_with_base_confs, x)) for x in language_family_confs] + language_family_confs = [x for x in language_family_confs if os.path.isfile(x) and x.endswith(".yaml")] + print("I see {} language families in {}.".format(len(language_family_confs), folder_with_base_confs)) + + d = {} + for f in language_family_confs: + js = json.load(open(f, "r")) + # list of lists where [0] is the lang code + _, name = os.path.split(f) + if name == "all.json": + continue + d[name.replace(".json","")] = set() + + for e in js: + p = e[0] + d[name.replace(".json", "")].add(p) + if "_" in p: + d[name.replace(".json", "")].add(p.split("_")[0]) + if len(d[name.replace(".json","")]) == 0: + del d[name.replace(".json","")] + else: + print("\tlanguage family {} has {} codes: {}".format(name.replace(".json",""), len(d[name.replace(".json","")]), d[name.replace(".json","")])) + + """ + For each language family, create a folder with the packed and split model in output folder + Mark language family as valid + """ + import uuid + + catalog = {} + catalog["default"] = [] + + os.makedirs(folder_where_to_output_everything, exist_ok=True) + for fam in d: + print("Processing language family {}:".format(fam)) + + # copy all files in a temp folder + model_files = os.listdir(folder_with_all_trained_models) + #print(model_files) + model_files = [x for x in model_files if x.startswith(fam+"-") and ".last" not in x and ".zip" not in x] + + # check model is complete + list_of_files = [ + "tokenizer.config", + "tokenizer.encodings", + "tokenizer.yaml", + + ] + #check("") + + #print(model_files) + model_files = [os.path.abspath(os.path.join(folder_with_all_trained_models,x)) for x in model_files] + if len(model_files) == 0: + print("\t no model files found, skipping ...") + continue + print("\t found {} model files, copying to temp folder ...".format(len(model_files))) + + temp_folder = os.path.join(folder_where_to_output_everything, str(uuid.uuid4().hex)) + os.mkdir(temp_folder) + for src_file in tqdm(model_files): + _, name = os.path.split(src_file) + dst_file = os.path.join(temp_folder, name) + #print((src_file, dst_file)) + copyfile(src_file, dst_file) + + # pack folder in zip file + print("\t packing model ... 
") + zip_file_path = os.path.join(temp_folder, fam+".zip") + ModelStore._pack_model(input_folder = temp_folder, output_file_path = zip_file_path) + + # split for upload to github + print("\t splitting model zip ...") + output_folder = os.path.join(folder_where_to_output_everything, fam) + os.makedirs(output_folder, exist_ok=True) + ModelStore._split_packed_model(file_path = zip_file_path, output_folder = output_folder) + + # delete temp folder + print("\t deleting temp folder ...") + if os.path.exists(temp_folder): + rmtree(temp_folder, ignore_errors=True) + + # make a catalog entry + entry = {} + entry["languages"] = list(d[fam]) + entry["model"] = url_root_for_models + fam + catalog["default"].append(entry) + + + """ + Generate catalog + """ + print("Writing catalog ...") + json.dump(catalog, open(os.path.join(folder_where_to_output_everything, "catalog.json"), "w", encoding="utf8"), indent=4, sort_keys=True) + + print("Done.") diff --git a/scripts/default_language.txt b/scripts/default_language.txt new file mode 100644 index 000000000..3bd8331df --- /dev/null +++ b/scripts/default_language.txt @@ -0,0 +1,73 @@ +af afribooms +grc proiel +ar padt +hy armtdp +eu bdt +be hse +bg btb +bxr bdt +ca ancora +zh gsdsimp +zh-hant gsd +lzh kyoto +cop scriptorium +hr set +cs pdt +da ddt +nl alpino +en ewt +et edt +fi tdt +fr gsd +gl ctg +de hdt +got proiel +el gdt +he htb +hi hdtb +hu szeged +id gsd +ga idt +it isdt +is icepahc +ja gsd +kk ktb +ko gsd +kmr mg +la ittb +lv lvtb +lt alksnis +olo kkpp +mt mudt +mr ufal +sme giella +no bokmaal +nn nynorsk +cu proiel +fro srcmf +orv torot +fa seraji +pl pdb +pt bosque +ro rrt +ru syntagrus +gd arcosg +sr set +sk snk +sl ssj +es gsd +sv talbanken +swl sslc +ta ttb +te mtg +tr imst +uk iu +hsb ufal +ur udtb +ug udt +vi vtb +wo wtb +fo oft +pcm nsc +sa vedic +cy ccg \ No newline at end of file diff --git a/scripts/generate_catalog_from_models.py b/scripts/generate_catalog_from_models.py new file mode 100644 index 000000000..50dd2d2f8 --- /dev/null +++ b/scripts/generate_catalog_from_models.py @@ -0,0 +1,174 @@ +import json, os, yaml, uuid +import ntpath +from shutil import rmtree, copyfile +import logging +from tqdm.autonotebook import tqdm as tqdm + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +os.sys.path.insert(0, parent_dir) + +from cube.io_utils.modelstore import ModelStore + +if __name__ == "__main__": + # set vars here + VERSION = "1.0" + FOLDER_WITH_YAMLS = os.path.abspath("scripts//train//2.7//language//") + FOLDER_WITH_TRAINED_MODELS = "/media/echo/5CA436CBA436A802/work/models" # os.path.abspath("models") + FOLDER_WHERE_TO_OUTPUT_EVERYTHING = "/media/echo/5CA436CBA436A802/work/nlp-cube-models" # os.path.abspath("nlp-cube-models") + URL_ROOT_FOR_MODELS = "https://raw.githubusercontent.com/adobe/NLP-Cube-Models/3.0/models/" # !! make sure it ends with a / + + """ + 0. Open existing catalog, create new key "version" if it does not exist + + 1. Load all yaml files: + lang_code_id : "lang_code" -> index in original yaml file + lang_map : "name" -> "lang_code" + + 2. For each lang_code, check to see if files are available to form a model. If not, report errors. + + 3. Copy all files in temp folder + + 4. Pack all files and split in shards + + 5. Move in final model dir with name lang_code.version.### + + 6. For all entries in lang_map, if lang_code is the current one, create entry in catalog if does not exist + "name":["version","link","lang_code_index"] + + + 7. 
Write catalog + """ + + logger = logging.getLogger("cube") + log_handler = logging.StreamHandler() + log_formatter = logging.Formatter( + fmt="[%(levelname)8s | %(asctime)s | %(filename)-20s:%(lineno)3s | %(funcName)-26s] %(message)s", + datefmt='%Y-%m-%d %H:%M:%S') + log_handler.setFormatter(log_formatter) + logger.addHandler(log_handler) + logger.setLevel(logging.DEBUG) + + + + # STEP 0 - open current catalog or create new + catalog_path = os.path.join(FOLDER_WHERE_TO_OUTPUT_EVERYTHING, "catalog.json") + if os.path.exists(catalog_path): + catalog = json.load(open(catalog_path, "r", encoding="utf8")) + else: + catalog = {} + + + # STEP 1 - load all yamls + langcode_to_index = {} + name_to_langcode = {} + + yamls = os.listdir(FOLDER_WITH_YAMLS) # this just lists the files without path + yamls = [os.path.abspath(os.path.join(FOLDER_WITH_YAMLS, x)) for x in yamls] # fill full path in + yamls = [x for x in yamls if os.path.isfile(x) and x.endswith(".yaml")] # filter out possible junk + print("I see {} yamls in {}.".format(len(yamls), FOLDER_WITH_YAMLS)) + + for yaml_file in yamls: + y = yaml.safe_load(open(yaml_file, "r")) + + for index, lang_code in enumerate(y["language_codes"]): + if lang_code in langcode_to_index: + print(f"Warning! lang_code {lang_code} already found in langcode_to_index!") + langcode_to_index[lang_code] = index + + for name in y["language_map"]: + if name in name_to_langcode: + print(f"Warning! name {name} already found in name_to_langcode!") + name_to_langcode[name] = y["language_map"][name] + + # STEP 2 - for each lang_code + for lang_code in langcode_to_index: + print(f"Working on {lang_code} ...") + + # STEP 3 - copy all relevant files for this language_code in temp folder + files = os.listdir(FOLDER_WITH_TRAINED_MODELS) # this just lists the files without path + files = [os.path.abspath(os.path.join(FOLDER_WITH_TRAINED_MODELS, x)) for x in files] # fill full path in + files = [x for x in files if os.path.isfile(x) and ".last" not in x] + + # get major language_code + tt = [x for x in files if lang_code in x] + if len(tt) == 0: + print(f"\tCould not get major language code!") + continue + tt = tt[0] + tt = ntpath.basename(tt) + major_lang_code = tt.split("-")[0] + print(f"\t major language code is [{major_lang_code}]") + + valid_files = [] + # copy encodings and config + for f in files: + ff = ntpath.basename(f) + if ff.startswith(major_lang_code+"-") and (".config" in ff or ".encodings" in ff): + valid_files.append(f) + # copy lang_codes with tok, best and las + for f in files: + if lang_code in f and (".tok" in f or ".best" in f or ".las" in f): + valid_files.append(f) + files = valid_files + + # check they are valid + found_tokenizer, found_lemmatizer, found_parser = False, False, False + for f in files: + if "tokenizer" in f: + found_tokenizer = True + if "lemmatizer" in f: + found_lemmatizer = True + if "parser" in f: + found_parser = True + + if not(found_tokenizer and found_lemmatizer and found_parser): + print(f"\t {lang_code} does not have all files: tokenizer={found_tokenizer}, lemmatizer={found_lemmatizer}, parser={found_parser}, skipping") + with open("log.txt", "a") as f: + f.write(f"\t {lang_code} does not have all files: tokenizer={found_tokenizer}, lemmatizer={found_lemmatizer}, parser={found_parser}, skipping\n") + continue + + temp_folder = os.path.join(FOLDER_WHERE_TO_OUTPUT_EVERYTHING, str(uuid.uuid4().hex)) + os.mkdir(temp_folder) + + # copy files to temp folder + print("\t copying files to temp folder ... 
") + for src_file in files: + _, name = os.path.split(src_file) + dst_file = os.path.join(temp_folder, name) + # print((src_file, dst_file)) + copyfile(src_file, dst_file) + + # pack folder in zip file + zip_file_path = os.path.join(temp_folder, lang_code + "-" + VERSION) + split_count = ModelStore._pack_model( + input_folder=temp_folder, + output_folder=FOLDER_WHERE_TO_OUTPUT_EVERYTHING, + model_name=lang_code + "-" + VERSION, + split_size_in_mb=99) + + # delete temp folder + print("\t deleting temp folder ...") + if os.path.exists(temp_folder): + rmtree(temp_folder, ignore_errors=True) + + # STEP 7 - make a catalog entry for all language names affected : + entry = { + "version": VERSION, + "link": URL_ROOT_FOR_MODELS + lang_code + "-" + VERSION, + "langid": langcode_to_index[lang_code], + "parts": split_count + } + + for name in name_to_langcode: + if name_to_langcode[name] == lang_code: + print(f"\t making a catalog entry for [{name}] -> [{lang_code}], {split_count} parts, langid {langcode_to_index[lang_code]}") + if name not in catalog: + catalog[name] = [] + + catalog[name].append(entry) + + print("Finished processing all language codes, writing catalog ... ") + + json.dump(catalog, open(catalog_path, "w", encoding="utf8"), indent=4, sort_keys=True) + + print("Done.") diff --git a/scripts/generate_training_yamls.py b/scripts/generate_training_yamls.py new file mode 100644 index 000000000..190a680a1 --- /dev/null +++ b/scripts/generate_training_yamls.py @@ -0,0 +1,311 @@ +import sys, os, yaml +from pprint import pprint +from bs4 import BeautifulSoup + +path_to_corpus_folder = "corpus/ud-treebanks-v2.5" +path_to_save_folder = "scripts/train/2.5" + +path_to_corpus_folder = "corpus/ud-treebanks-v2.7/" +path_to_save_folder = "scripts/train/2.7" + +# read default language map +default_language_map = {} +with open("scripts/default_language.txt","r") as f: + lines = f.readlines() + for l in lines: + p = l.strip().split("\t") + major, treebank = p[0], p[0]+"_"+p[1] + default_language_map[major] = treebank + +# read UD extracted table file as html <-- copy-paste from UD's website of the table only +with open("scripts/ud_table.html","r", encoding="utf8") as f: + source = f.read() + +soup = BeautifulSoup(source, features="html.parser") +table = [] + +##### STEP 1, extract data from html table +header = True +for tag in soup.div.findChildren(recursive=False): + if str(tag).strip() == "" or len(str(tag))<50: + continue + if header: + #print(tag) + for elem in tag.find_all("span", class_="doublewidespan", recursive=False): + if elem.text.strip() != "": + language_name = elem.text + print("Language name: "+ language_name) + + for elem in tag.find_all("span", class_="triplewidespan", recursive=False): + if elem.text.strip() != "": + language_family = [x.strip() for x in elem.text.split(",")] + print("Language family: {}".format(language_family)) + + header = False + else: + cheader = True + for content in tag.find_all("div", class_="ui-accordion-icons", recursive=False): # just 1 content + for content_div in content.findChildren(recursive=False): # list of divs + #print(content_div) + #print("_" * 50) + + if cheader: # get name of treebank + for sp in content_div.find_all("span", class_="doublewidespan", recursive=False): + #print(sp) + if sp.text.strip() != "": + treebank_name = sp.text + print("Treebank name: "+treebank_name) + cheader = False + else: # get folder name + for sp in content_div.find_all("a", href=True): + #print(sp["href"]) + link = str(sp["href"]).strip() + if "UniversalDependencies" 
in link: + link = link[41:] + treebank_folder = link[:link.find("/")] + elif "treebanks/" in link: + link = link[10:] + link = link[:link.find("/")] + language_code = link + major_language_code = link.split("_")[0] + + print("Treebank folder: " + treebank_folder) + print("Language code: " + language_code) + print("Major language code: " + major_language_code) + + if language_name == None: + raise Exception("Language name is none") + if language_family == None: + raise Exception("Language family none") + if treebank_name == None: + raise Exception("Treebank name is none") + if treebank_folder == None: + raise Exception("Treebank folder is none") + + telem = {} + telem["language_name"] = language_name + telem["language_family"] = language_family + telem["treebank_name"] = treebank_name + telem["treebank_folder"] = treebank_folder + telem["language_code"] = language_code + telem["major_language_code"] = major_language_code + print(telem) + if "simonero" in language_code: + continue + if "_pud" in language_code: + continue + table.append(telem) + treebank_name, treebank_folder, language_code, major_language_code = None, None, None, None + + cheader = True + + # reset for next language + language_name, language_family, treebank_name, treebank_folder = None, None, None, None + + header = True + #input('\n\n----------\n\n') + +##### STEP 2, add paths to train dev test +valid = [] +for l in table: + train_file = os.path.join(path_to_corpus_folder, l["treebank_folder"], l["language_code"] + "-ud-train.conllu") + dev_file = os.path.join(path_to_corpus_folder, l["treebank_folder"], l["language_code"] + "-ud-dev.conllu") + test_file = os.path.join(path_to_corpus_folder, l["treebank_folder"], l["language_code"] + "-ud-test.conllu") + + # for PUD treebanks + if l["language_code"].endswith("pud"): # languages only with test files + l["train_file"] = [l["language_code"],train_file] + l["dev_file"] = None + l["test_file"] = None + else: # regular treebank + if os.path.exists(train_file): + l["train_file"] = [l["language_code"],train_file] + else: + l["train_file"] = None + if os.path.exists(dev_file): + l["dev_file"] = [l["language_code"],dev_file] + else: + l["dev_file"] = None + if os.path.exists(test_file): + if l["dev_file"] == None: + l["dev_file"] = [l["language_code"],test_file] + l["test_file"] = [l["language_code"],test_file] + else: + l["test_file"] = [l["language_code"],test_file] + else: + l["test_file"] = None + + if l["train_file"] is not None: + valid.append(l) + else: + print(" Language skipped due to no training file: ") + pprint(l) + print("\n") + +table = valid + + +#### STEP 3, generate train files + +# generate single language files +print("\n\nRunning single treebank ...\n") +folder = os.path.join(path_to_save_folder, "treebank") +os.makedirs(folder, exist_ok=True) + +for l in table: + obj = { + "language_map": {l["language_code"]:l["language_code"]}, + "language_codes": [l["language_code"]], + "train_files": {}, + "dev_files": {}, + "test_files": {} + } + + if l["train_file"] is not None: + obj["train_files"][l["train_file"][0]] = l["train_file"][1] + if l["dev_file"] is not None: + obj["dev_files"][l["dev_file"][0]] = l["dev_file"][1] + if l["test_file"] is not None: + obj["test_files"][l["test_file"][0]] = l["test_file"][1] + else: + if l["dev_file"] is not None: + obj["test_files"][l["dev_file"][0]] = l["dev_file"][1] + + obj["language_codes"] = list(set([obj["language_map"][key] for key in obj["language_map"]])) + + if len(obj["train_files"]) == 0 : + print("\tLanguage {} has 
zero training files, skipping. ".format(l["language_code"])) + continue + if len(obj["dev_files"]) == 0 : + print("\tLanguage {} has zero dev files, skipping. ".format(l["language_code"])) + continue + if len(obj["test_files"]) == 0: + print("\tLanguage {} has zero test files, skipping. ".format(l["language_code"])) + continue + + filename = l["language_code"]+".yaml" + with open(os.path.join(folder, filename), 'w') as f: + data = yaml.dump(obj, f, sort_keys=True) + +# generate per major language files (e.g. en) +print("\n\nRunning per language treebank aggregation ...\n") +folder = os.path.join(path_to_save_folder, "language") +os.makedirs(folder, exist_ok=True) + +groups = {} +for l in table: + lc = l["major_language_code"] + if lc not in groups: + groups[lc] = [] + groups[lc].append(l) + +for g in groups: + if g not in default_language_map: + #raise Exception("\t\tLanguage {} does not have default language code, skipping!".format(g)) + print("\tLanguage {} ({}) does not have default language code, skipping!".format(groups[g][0]["language_name"], g)) + continue + + obj = { + "language_map": {}, + "language_codes": [], + "train_files": {}, + "dev_files": {}, + "test_files": {} + } + for l in groups[g]: + # add full language name, lowercased + if l["language_name"].lower() not in obj["language_map"]: + obj["language_map"][l["language_name"].lower()] = default_language_map[g] + # add major language code + if g not in obj["language_map"]: + obj["language_map"][g] = default_language_map[g] + # add treebank code + obj["language_map"][l["language_code"]] = l["language_code"] + + if l["train_file"] is not None: + obj["train_files"][l["train_file"][0]] = l["train_file"][1] + if l["dev_file"] is not None: + obj["dev_files"][l["dev_file"][0]] = l["dev_file"][1] + if l["test_file"] is not None: + obj["test_files"][l["test_file"][0]] = l["test_file"][1] + + obj["language_codes"] = list(set([obj["language_map"][key] for key in obj["language_map"]])) + + if len(obj["train_files"]) == 0 : + print("\tLanguage {} ({}) has zero training files, skipping. ".format(l["language_name"], g)) + continue + if len(obj["dev_files"]) == 0 : + print("\tLanguage {} ({}) has zero dev files, skipping. ".format(l["language_name"], g)) + continue + if len(obj["test_files"]) == 0: + print("\tLanguage {} ({}) has zero test files, skipping. 
".format(l["language_name"], g)) + continue + + filename = l["major_language_code"]+".yaml" + with open(os.path.join(folder, filename), 'w') as f: + data = yaml.dump(obj, f, sort_keys=True) + +# generate per language family +print("\n\nRunning per language family aggregation ...\n") +folder = os.path.join(path_to_save_folder, "family") +os.makedirs(folder, exist_ok=True) + +groups = {} +for l in table: + if l["major_language_code"] not in default_language_map: + #raise Exception("\t\tLanguage {} does not have default language code, skipping!".format(g)) + print("\tLanguage {} ({}) does not have default language code, skipping!".format(l["language_name"], l["major_language_code"])) + continue + + families = l["language_family"] + for f in families: + if f != "IE" and "Afro-Asiatic" not in f: + if f not in groups: + groups[f] = [] + + groups[f].append(l) + +for g in groups: + obj = { + "language_map": {}, + "language_codes": [], + "train_files": {}, + "dev_files": {}, + "test_files": {} + } + + for l in groups[g]: + # add full language name, lowercased + if l["language_name"].lower() not in obj["language_map"]: + obj["language_map"][l["language_name"].lower()] = default_language_map[l["major_language_code"]] + # add major language code + if g not in obj["language_map"]: + obj["language_map"][l["major_language_code"]] = default_language_map[l["major_language_code"]] + # add treebank code + obj["language_map"][l["language_code"]] = l["language_code"] + + if l["train_file"] is not None: + obj["train_files"][l["train_file"][0]] = l["train_file"][1] + if l["dev_file"] is not None: + obj["dev_files"][l["dev_file"][0]] = l["dev_file"][1] + if l["test_file"] is not None: + obj["test_files"][l["test_file"][0]] = l["test_file"][1] + + obj["language_codes"] = list(set([obj["language_map"][key] for key in obj["language_map"]])) + + if len(obj["train_files"]) == 0 : + print("\tLanguage {} (fam: {}) has zero training files, skipping. ".format(l["language_name"], g)) + pprint(obj) + continue + if len(obj["dev_files"]) == 0 : + print("\tLanguage {} (fam: {}) has zero dev files, skipping. ".format(l["language_name"], g)) + pprint(obj) + continue + if len(obj["test_files"]) == 0: + print("\tLanguage {} (fam: {}) has zero test files, skipping. 
".format(l["language_name"], g)) + pprint(obj) + continue + + filename = g.lower()+".yaml" + with open(os.path.join(folder, filename), 'w') as f: + yaml.dump(obj, f, sort_keys=True) diff --git a/scripts/list2train.py b/scripts/list2train.py new file mode 100644 index 000000000..4c3ff60ed --- /dev/null +++ b/scripts/list2train.py @@ -0,0 +1,218 @@ +import json +import sys +import optparse + +sys.path.append('') + + +def _is_complete_corpus(filename): + from cube.io_utils.conll import Dataset + dataset = Dataset() + dataset.load_language(filename, 0) + ok = False + for entry in dataset.sequences[0][0]: + word = entry.word.replace('_', '').strip() + if word != '': + ok = True + break + return ok + + +def _get_list_of_folders_containing(dirName, pattern): + import os + listOfFile = os.listdir(dirName) + allFiles = list() + # Iterate over all the entries + for entry in listOfFile: + # Create full path + fullPath = os.path.join(dirName, entry) + if os.path.isdir(fullPath) and pattern in fullPath: + allFiles.append(entry) + return allFiles + + +def _get_list_of_files(dirName): + import os + listOfFile = os.listdir(dirName) + allFiles = list() + # Iterate over all the entries + for entry in listOfFile: + # Create full path + fullPath = os.path.join(dirName, entry) + if os.path.isfile(fullPath): + allFiles.append(fullPath) + return allFiles + + +def _get_file(train_base, language, type): + import os.path as path + pth = path.join(train_base, language) + files = _get_list_of_files(pth) + for file in files: + if file.endswith('-{0}.conllu'.format(type)): + return file + return '' + + +def _get_lang_id(file): + parts = file.split('/') + lang_id = parts[-1].split('-')[0] + return lang_id + + +def _process_single(params): + lines = open(params.input_file).readlines() + combined = [] + for line in lines: + line = line.strip() + train_file = _get_file(params.train_base, line, 'train') + dev_file = _get_file(params.train_base, line, 'dev') + if dev_file == '': + dev_file = _get_file(params.train_base, line, 'test') + if train_file != '' and dev_file != '': + lang_id = _get_lang_id(train_file) + if _is_complete_corpus(dev_file): + combined.append([lang_id, train_file, dev_file]) + else: + sys.stdout.write('Removing incomplete languge: "{0}"\n'.format(lang_id)) + + json.dump(combined, open(params.output_file, 'w')) + + +def _process_multi(params): + families = {'Germanic': 10, + 'Afro-Asiatic': 7, + 'Albanian': 1, + 'Greek': 2, + 'Armenian': 1, + 'Mande': 1, + 'Basque': 1, + 'Slavic': 13, + 'Indic': 6, + 'Celtic': 4, + 'Mongolic': 1, + 'Sino-Tibetan': 3, + 'Romance': 8, + 'Uralic': 11, + 'Austronesian': 1, + 'Japanese': 1, + 'Turkic': 3, + 'Korean': 1, + 'Iranian': 2, + 'Latin': 1, + 'Baltic': 2, + 'Tupian': 1, + 'Creole': 1, + 'Dravidian': 1, + 'Tai-Kadai': 1, + 'Austro-Asiatic': 1, + 'Pama-Nyungan': 1, + 'Niger-Congo': 1} + + family2dataset = {k: [] for k in families} + lines = open(params.input_file).readlines() + added_folders = {} + for line in lines: + line = line.strip() + parts = line.split(' ') + ltypes = parts[1].replace(', ', ',').split(',') + family = '' + for ltype in ltypes: + if ltype in family2dataset: + family = ltype + break + if family == '': + print("Unable to process " + line) + else: + language_name = parts[0].split(' ')[0] + all_folders = _get_list_of_folders_containing(params.train_base, language_name) + if len(all_folders) == 0: + print("Unable to find training data for " + language_name) + else: + for folder in all_folders: + if folder in added_folders: + continue + + 
diff --git a/scripts/train/2.5/family/armenian.yaml b/scripts/train/2.5/family/armenian.yaml new file mode 100644 index 000000000..6813be43b --- /dev/null +++ b/scripts/train/2.5/family/armenian.yaml @@ -0,0 +1,12 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + armenian: hy_armtdp + hy: hy_armtdp + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: +
hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.5/family/austro-asiatic.yaml b/scripts/train/2.5/family/austro-asiatic.yaml new file mode 100644 index 000000000..3341261a2 --- /dev/null +++ b/scripts/train/2.5/family/austro-asiatic.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.5/family/austronesian.yaml b/scripts/train/2.5/family/austronesian.yaml new file mode 100644 index 000000000..bbf08cdbf --- /dev/null +++ b/scripts/train/2.5/family/austronesian.yaml @@ -0,0 +1,15 @@ +dev_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_pud +- id_gsd +language_map: + id: id_gsd + id_gsd: id_gsd + id_pud: id_pud + indonesian: id_gsd +test_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu + id_pud: corpus/ud-treebanks-v2.5/UD_Indonesian-PUD/id_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/baltic.yaml b/scripts/train/2.5/family/baltic.yaml new file mode 100644 index 000000000..f8662bff5 --- /dev/null +++ b/scripts/train/2.5/family/baltic.yaml @@ -0,0 +1,24 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lt_alksnis +- lt_hse +- lv_lvtb +language_map: + latvian: lv_lvtb + lithuanian: lt_alksnis + lt: lt_alksnis + lt_alksnis: lt_alksnis + lt_hse: lt_hse + lv: lv_lvtb + lv_lvtb: lv_lvtb +test_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-test.conllu + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.5/family/basque.yaml b/scripts/train/2.5/family/basque.yaml new file mode 100644 index 000000000..a6b12983c --- /dev/null +++ b/scripts/train/2.5/family/basque.yaml @@ -0,0 +1,12 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + basque: eu_bdt + eu: eu_bdt + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.5/family/celtic.yaml b/scripts/train/2.5/family/celtic.yaml new file mode 100644 index 000000000..c63c96b49 --- /dev/null +++ b/scripts/train/2.5/family/celtic.yaml @@ -0,0 +1,19 @@ +dev_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +- ga_idt +language_map: + ga: ga_idt + ga_idt: ga_idt + gd: 
gd_arcosg + gd_arcosg: gd_arcosg + irish: ga_idt + scottish gaelic: gd_arcosg +test_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-test.conllu + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.5/family/dravidian.yaml b/scripts/train/2.5/family/dravidian.yaml new file mode 100644 index 000000000..ce3241d54 --- /dev/null +++ b/scripts/train/2.5/family/dravidian.yaml @@ -0,0 +1,19 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- ta_ttb +- te_mtg +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-test.conllu + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.5/family/egyptian.yaml b/scripts/train/2.5/family/egyptian.yaml new file mode 100644 index 000000000..770e9eaa0 --- /dev/null +++ b/scripts/train/2.5/family/egyptian.yaml @@ -0,0 +1,12 @@ +dev_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop: cop_scriptorium + cop_scriptorium: cop_scriptorium + coptic: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.5/family/finnic.yaml b/scripts/train/2.5/family/finnic.yaml new file mode 100644 index 000000000..c2054e108 --- /dev/null +++ b/scripts/train/2.5/family/finnic.yaml @@ -0,0 +1,39 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- fi_pud +- et_ewt +- fi_tdt +- fi_ftb +- olo_kkpp +- et_edt +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt + fi: fi_tdt + fi_ftb: fi_ftb + fi_pud: fi_pud + fi_tdt: fi_tdt + finnish: fi_tdt + livvi: olo_kkpp + olo: olo_kkpp + olo_kkpp: olo_kkpp +test_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-test.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_pud: corpus/ud-treebanks-v2.5/UD_Finnish-PUD/fi_pud-ud-train.conllu + fi_tdt: 
corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.5/family/germanic.yaml b/scripts/train/2.5/family/germanic.yaml new file mode 100644 index 000000000..a4d380f58 --- /dev/null +++ b/scripts/train/2.5/family/germanic.yaml @@ -0,0 +1,121 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-dev.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- en_esl +- de_gsd +- en_pud +- nl_alpino +- da_ddt +- is_icepahc +- sv_pud +- en_gum +- got_proiel +- en_lines +- is_pud +- no_nynorsklia +- no_bokmaal +- sv_lines +- en_ewt +- no_nynorsk +- en_partut +- sv_talbanken +- nl_lassysmall +- de_hdt +- de_pud +- af_afribooms +language_map: + af: af_afribooms + af_afribooms: af_afribooms + afrikaans: af_afribooms + da: da_ddt + da_ddt: da_ddt + danish: da_ddt + de: de_hdt + de_gsd: de_gsd + de_hdt: de_hdt + de_pud: de_pud + dutch: nl_alpino + en: en_ewt + en_esl: en_esl + en_ewt: en_ewt + en_gum: en_gum + en_lines: en_lines + en_partut: en_partut + en_pud: en_pud + english: en_ewt + german: de_hdt + got: got_proiel + got_proiel: got_proiel + gothic: got_proiel + icelandic: is_icepahc + is: is_icepahc + is_pud: is_pud + nl: nl_alpino + nl_alpino: nl_alpino + nl_lassysmall: nl_lassysmall + 'no': no_bokmaal + no_bokmaal: no_bokmaal + no_nynorsk: no_nynorsk + no_nynorsklia: no_nynorsklia + norwegian: no_bokmaal + sv: sv_talbanken + sv_lines: sv_lines + sv_pud: sv_pud + sv_talbanken: sv_talbanken + swedish: sv_talbanken +test_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-test.conllu + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-test.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-test.conllu + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-test.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-test.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-test.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-test.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-test.conllu + got_proiel: 
diff --git a/scripts/train/2.5/family/germanic.yaml b/scripts/train/2.5/family/germanic.yaml new file mode 100644 index 000000000..a4d380f58 --- /dev/null +++ b/scripts/train/2.5/family/germanic.yaml @@ -0,0 +1,121 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-dev.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- en_esl +- de_gsd +- en_pud +- nl_alpino +- da_ddt +- is_icepahc +- sv_pud +- en_gum +- got_proiel +- en_lines +- is_pud +- no_nynorsklia +- no_bokmaal +- sv_lines +- en_ewt +- no_nynorsk +- en_partut +- sv_talbanken +- nl_lassysmall +- de_hdt +- de_pud +- af_afribooms +language_map: + af: af_afribooms + af_afribooms: af_afribooms + afrikaans: af_afribooms + da: da_ddt + da_ddt: da_ddt + danish: da_ddt + de: de_hdt + de_gsd: de_gsd + de_hdt: de_hdt + de_pud: de_pud + dutch: nl_alpino + en: en_ewt + en_esl: en_esl + en_ewt: en_ewt + en_gum: en_gum + en_lines: en_lines + en_partut: en_partut + en_pud: en_pud + english: en_ewt + german: de_hdt + got: got_proiel + got_proiel: got_proiel + gothic: got_proiel + icelandic: is_icepahc + is: is_icepahc + is_pud: is_pud + nl: nl_alpino + nl_alpino: nl_alpino + nl_lassysmall: nl_lassysmall + 'no': no_bokmaal + no_bokmaal: no_bokmaal + no_nynorsk: no_nynorsk + no_nynorsklia: no_nynorsklia + norwegian: no_bokmaal + sv: sv_talbanken + sv_lines: sv_lines + sv_pud: sv_pud + sv_talbanken: sv_talbanken + swedish: sv_talbanken +test_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-test.conllu + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-test.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-test.conllu + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-test.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-test.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-test.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-test.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-test.conllu + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-test.conllu + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-test.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-test.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu + de_pud: corpus/ud-treebanks-v2.5/UD_German-PUD/de_pud-ud-train.conllu + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-train.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu + en_pud: corpus/ud-treebanks-v2.5/UD_English-PUD/en_pud-ud-train.conllu + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu + is_pud: corpus/ud-treebanks-v2.5/UD_Icelandic-PUD/is_pud-ud-train.conllu + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu + sv_pud: corpus/ud-treebanks-v2.5/UD_Swedish-PUD/sv_pud-ud-train.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu
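One generated detail in germanic.yaml above worth preserving: the Norwegian key is emitted quoted, as 'no': no_bokmaal, because YAML 1.1 parsers such as PyYAML resolve a bare no to the boolean false, which would silently break the language_map lookup. An illustrative check, not part of this commit:

import yaml

# Unquoted, the key collapses to a boolean and lookups for 'no' fail.
assert yaml.safe_load('no: no_bokmaal') == {False: 'no_bokmaal'}
# Quoted, it stays the string key that the config generator intends.
assert yaml.safe_load("'no': no_bokmaal") == {'no': 'no_bokmaal'}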
diff --git a/scripts/train/2.5/family/greek.yaml b/scripts/train/2.5/family/greek.yaml new file mode 100644 index 000000000..10d4bedb1 --- /dev/null +++ b/scripts/train/2.5/family/greek.yaml @@ -0,0 +1,24 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_perseus +- el_gdt +- grc_proiel +language_map: + ancient greek: grc_proiel + el: el_gdt + el_gdt: el_gdt + grc: grc_proiel + grc_perseus: grc_perseus + grc_proiel: grc_proiel + greek: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-test.conllu + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git a/scripts/train/2.5/family/indic.yaml b/scripts/train/2.5/family/indic.yaml new file mode 100644 index 000000000..388aed174 --- /dev/null +++ b/scripts/train/2.5/family/indic.yaml @@ -0,0 +1,29 @@ +dev_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- mr_ufal +- ur_udtb +- hi_pud +- hi_hdtb +language_map: + hi: hi_hdtb + hi_hdtb: hi_hdtb + hi_pud: hi_pud + hindi: hi_hdtb + marathi: mr_ufal + mr: mr_ufal + mr_ufal: mr_ufal + ur: ur_udtb + ur_udtb: ur_udtb + urdu: ur_udtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-test.conllu + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-test.conllu +train_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu + hi_pud: corpus/ud-treebanks-v2.5/UD_Hindi-PUD/hi_pud-ud-train.conllu + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.5/family/iranian.yaml b/scripts/train/2.5/family/iranian.yaml new file mode 100644 index 000000000..b49fe1857 --- /dev/null +++ b/scripts/train/2.5/family/iranian.yaml @@ -0,0 +1,19 @@ +dev_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- fa_seraji +- kmr_mg +language_map: + fa: fa_seraji + fa_seraji: fa_seraji + kmr: kmr_mg + kmr_mg: kmr_mg + kurmanji: kmr_mg + persian: fa_seraji +test_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-test.conllu + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.5/family/japanese.yaml b/scripts/train/2.5/family/japanese.yaml new file mode 100644 index 000000000..15d1579d4 --- /dev/null +++ b/scripts/train/2.5/family/japanese.yaml @@ -0,0 +1,20 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_gsd +- ja_bccwj +- ja_pud +language_map: + ja: ja_gsd + ja_bccwj: ja_bccwj + ja_gsd: ja_gsd + ja_pud: ja_pud + japanese: ja_gsd +test_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu + ja_pud: corpus/ud-treebanks-v2.5/UD_Japanese-PUD/ja_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/korean.yaml b/scripts/train/2.5/family/korean.yaml new file mode 100644 index 000000000..b8194f554 --- /dev/null +++ b/scripts/train/2.5/family/korean.yaml @@ -0,0 +1,20 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_pud +- ko_kaist +- ko_gsd +language_map: + ko:
ko_gsd + ko_gsd: ko_gsd + ko_kaist: ko_kaist + ko_pud: ko_pud + korean: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-test.conllu + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu + ko_pud: corpus/ud-treebanks-v2.5/UD_Korean-PUD/ko_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/latin.yaml b/scripts/train/2.5/family/latin.yaml new file mode 100644 index 000000000..4b812d620 --- /dev/null +++ b/scripts/train/2.5/family/latin.yaml @@ -0,0 +1,22 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_ittb +- la_perseus +- la_proiel +language_map: + la: la_ittb + la_ittb: la_ittb + la_perseus: la_perseus + la_proiel: la_proiel + latin: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-test.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.5/family/malayo-sumbawan.yaml b/scripts/train/2.5/family/malayo-sumbawan.yaml new file mode 100644 index 000000000..bbf08cdbf --- /dev/null +++ b/scripts/train/2.5/family/malayo-sumbawan.yaml @@ -0,0 +1,15 @@ +dev_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_pud +- id_gsd +language_map: + id: id_gsd + id_gsd: id_gsd + id_pud: id_pud + indonesian: id_gsd +test_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu + id_pud: corpus/ud-treebanks-v2.5/UD_Indonesian-PUD/id_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/mongolic.yaml b/scripts/train/2.5/family/mongolic.yaml new file mode 100644 index 000000000..1e68dc47d --- /dev/null +++ b/scripts/train/2.5/family/mongolic.yaml @@ -0,0 +1,12 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + buryat: bxr_bdt + bxr: bxr_bdt + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.5/family/niger-congo.yaml b/scripts/train/2.5/family/niger-congo.yaml new file mode 100644 index 000000000..17ca88a87 --- /dev/null +++ b/scripts/train/2.5/family/niger-congo.yaml @@ -0,0 +1,12 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.5/family/northern atlantic.yaml b/scripts/train/2.5/family/northern atlantic.yaml new file mode 
100644 index 000000000..17ca88a87 --- /dev/null +++ b/scripts/train/2.5/family/northern atlantic.yaml @@ -0,0 +1,12 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.5/family/northwestern.yaml b/scripts/train/2.5/family/northwestern.yaml new file mode 100644 index 000000000..9ac4c36b0 --- /dev/null +++ b/scripts/train/2.5/family/northwestern.yaml @@ -0,0 +1,12 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb +test_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.5/family/romance.yaml b/scripts/train/2.5/family/romance.yaml new file mode 100644 index 000000000..6338bb990 --- /dev/null +++ b/scripts/train/2.5/family/romance.yaml @@ -0,0 +1,133 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- fr_sequoia +- it_twittiro +- ro_rrt +- gl_treegal +- fr_pud +- pt_bosque +- es_pud +- it_postwita +- it_vit +- it_isdt +- pt_pud +- fr_ftb +- it_partut +- ca_ancora +- fr_partut +- es_gsd +- fro_srcmf +- pt_gsd +- fr_spoken +- it_pud +- gl_ctg +- es_ancora +- fr_gsd +- ro_nonstandard +language_map: + ca: ca_ancora + ca_ancora: ca_ancora + catalan: ca_ancora + es: es_gsd + es_ancora: es_ancora + es_gsd: es_gsd + es_pud: es_pud + fr: fr_gsd + fr_ftb: fr_ftb + fr_gsd: fr_gsd + fr_partut: fr_partut + fr_pud: fr_pud + fr_sequoia: fr_sequoia + fr_spoken: fr_spoken + french: fr_gsd + fro: fro_srcmf + fro_srcmf: fro_srcmf + galician: gl_ctg + gl: gl_ctg + gl_ctg: gl_ctg + gl_treegal: gl_treegal + it: 
it_isdt + it_isdt: it_isdt + it_partut: it_partut + it_postwita: it_postwita + it_pud: it_pud + it_twittiro: it_twittiro + it_vit: it_vit + italian: it_isdt + old french: fro_srcmf + portuguese: pt_bosque + pt: pt_bosque + pt_bosque: pt_bosque + pt_gsd: pt_gsd + pt_pud: pt_pud + ro: ro_rrt + ro_nonstandard: ro_nonstandard + ro_rrt: ro_rrt + romanian: ro_rrt + spanish: es_gsd +test_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-test.conllu + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-test.conllu + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-test.conllu + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-test.conllu + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-test.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-test.conllu + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-test.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-test.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-test.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-test.conllu + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-test.conllu + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu + es_pud: corpus/ud-treebanks-v2.5/UD_Spanish-PUD/es_pud-ud-train.conllu + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu + fr_pud: corpus/ud-treebanks-v2.5/UD_French-PUD/fr_pud-ud-train.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu + it_pud: corpus/ud-treebanks-v2.5/UD_Italian-PUD/it_pud-ud-train.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu + pt_bosque: 
corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu + pt_pud: corpus/ud-treebanks-v2.5/UD_Portuguese-PUD/pt_pud-ud-train.conllu + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.5/family/sami.yaml b/scripts/train/2.5/family/sami.yaml new file mode 100644 index 000000000..448972339 --- /dev/null +++ b/scripts/train/2.5/family/sami.yaml @@ -0,0 +1,12 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + north sami: sme_giella + sme: sme_giella + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.5/family/semitic.yaml b/scripts/train/2.5/family/semitic.yaml new file mode 100644 index 000000000..1df2ba7d6 --- /dev/null +++ b/scripts/train/2.5/family/semitic.yaml @@ -0,0 +1,34 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- mt_mudt +- ar_pud +- ar_nyuad +- ar_padt +- he_htb +language_map: + ar: ar_padt + ar_nyuad: ar_nyuad + ar_padt: ar_padt + ar_pud: ar_pud + arabic: ar_padt + he: he_htb + he_htb: he_htb + hebrew: he_htb + maltese: mt_mudt + mt: mt_mudt + mt_mudt: mt_mudt +test_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-test.conllu + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-test.conllu + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu + ar_pud: corpus/ud-treebanks-v2.5/UD_Arabic-PUD/ar_pud-ud-train.conllu + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.5/family/sign language.yaml b/scripts/train/2.5/family/sign language.yaml new file mode 100644 index 000000000..f1f89d077 --- /dev/null +++ b/scripts/train/2.5/family/sign language.yaml @@ -0,0 +1,12 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swedish sign language: swl_sslc + swl: swl_sslc + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.5/family/sino-tibetan.yaml b/scripts/train/2.5/family/sino-tibetan.yaml new file mode 100644 index 000000000..92f02af65 --- /dev/null +++ b/scripts/train/2.5/family/sino-tibetan.yaml @@ -0,0 +1,27 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu + zh_gsd: 
corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu +language_codes: +- zh_gsdsimp +- zh_pud +- zh_gsd +- lzh_kyoto +language_map: + chinese: zh_gsdsimp + classical chinese: lzh_kyoto + lzh: lzh_kyoto + lzh_kyoto: lzh_kyoto + zh: zh_gsdsimp + zh_gsd: zh_gsd + zh_gsdsimp: zh_gsdsimp + zh_pud: zh_pud +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-test.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu + zh_pud: corpus/ud-treebanks-v2.5/UD_Chinese-PUD/zh_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/slavic.yaml b/scripts/train/2.5/family/slavic.yaml new file mode 100644 index 000000000..8dfd19397 --- /dev/null +++ b/scripts/train/2.5/family/slavic.yaml @@ -0,0 +1,133 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- cs_cltt +- sr_set +- ru_syntagrus +- cu_proiel +- sk_snk +- hsb_ufal +- cs_fictree +- ru_gsd +- ru_pud +- pl_pdb +- uk_iu +- sl_ssj +- hr_set +- be_hse +- cs_pdt +- ru_taiga +- cs_pud +- cs_cac +- bg_btb +- pl_pud +- sl_sst +- pl_lfg +language_map: + be: be_hse + be_hse: be_hse + belarusian: be_hse + bg: bg_btb + bg_btb: bg_btb + bulgarian: bg_btb + croatian: hr_set + cs: cs_pdt + cs_cac: cs_cac + cs_cltt: cs_cltt + cs_fictree: cs_fictree + cs_pdt: cs_pdt + cs_pud: cs_pud + cu: cu_proiel + cu_proiel: cu_proiel + czech: cs_pdt + hr: hr_set + hr_set: hr_set + hsb: hsb_ufal + hsb_ufal: hsb_ufal + old church slavonic: cu_proiel + pl: pl_pdb + pl_lfg: pl_lfg + pl_pdb: pl_pdb + pl_pud: pl_pud + polish: pl_pdb + ru: ru_syntagrus + ru_gsd: ru_gsd + ru_pud: ru_pud + ru_syntagrus: ru_syntagrus + ru_taiga: ru_taiga + russian: ru_syntagrus + serbian: sr_set + sk: sk_snk + sk_snk: sk_snk + sl: sl_ssj + 
sl_ssj: sl_ssj + sl_sst: sl_sst + slovak: sk_snk + slovenian: sl_ssj + sr: sr_set + sr_set: sr_set + uk: uk_iu + uk_iu: uk_iu + ukrainian: uk_iu + upper sorbian: hsb_ufal +test_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-test.conllu + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-test.conllu + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-test.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-test.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-test.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-test.conllu + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-test.conllu + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-test.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-test.conllu + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-test.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-test.conllu + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-test.conllu + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-test.conllu + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu + cs_pud: corpus/ud-treebanks-v2.5/UD_Czech-PUD/cs_pud-ud-train.conllu + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu + pl_pud: corpus/ud-treebanks-v2.5/UD_Polish-PUD/pl_pud-ud-train.conllu + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu + ru_pud: corpus/ud-treebanks-v2.5/UD_Russian-PUD/ru_pud-ud-train.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.5/family/south central.yaml b/scripts/train/2.5/family/south central.yaml new file mode 100644 index 000000000..0bf9087a4 --- /dev/null +++ b/scripts/train/2.5/family/south central.yaml @@ -0,0 +1,12 @@ +dev_files: + 
te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.5/family/southeastern.yaml b/scripts/train/2.5/family/southeastern.yaml new file mode 100644 index 000000000..0f88152bd --- /dev/null +++ b/scripts/train/2.5/family/southeastern.yaml @@ -0,0 +1,12 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.5/family/southern.yaml b/scripts/train/2.5/family/southern.yaml new file mode 100644 index 000000000..15b97eeb0 --- /dev/null +++ b/scripts/train/2.5/family/southern.yaml @@ -0,0 +1,12 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.5/family/southwestern.yaml b/scripts/train/2.5/family/southwestern.yaml new file mode 100644 index 000000000..6a7ca6e4a --- /dev/null +++ b/scripts/train/2.5/family/southwestern.yaml @@ -0,0 +1,15 @@ +dev_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_pud +- tr_imst +language_map: + tr: tr_imst + tr_imst: tr_imst + tr_pud: tr_pud + turkish: tr_imst +test_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu + tr_pud: corpus/ud-treebanks-v2.5/UD_Turkish-PUD/tr_pud-ud-train.conllu diff --git a/scripts/train/2.5/family/turkic.yaml b/scripts/train/2.5/family/turkic.yaml new file mode 100644 index 000000000..952307510 --- /dev/null +++ b/scripts/train/2.5/family/turkic.yaml @@ -0,0 +1,29 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- tr_pud +- kk_ktb +- tr_imst +- ug_udt +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb + tr: tr_imst + tr_imst: tr_imst + tr_pud: tr_pud + turkish: tr_imst + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-test.conllu + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu + tr_pud: corpus/ud-treebanks-v2.5/UD_Turkish-PUD/tr_pud-ud-train.conllu + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.5/family/ugric.yaml b/scripts/train/2.5/family/ugric.yaml new file mode 100644 index 000000000..0d698da78 --- /dev/null +++ b/scripts/train/2.5/family/ugric.yaml @@ -0,0 +1,12 
@@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu: hu_szeged + hu_szeged: hu_szeged + hungarian: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.5/family/uralic.yaml b/scripts/train/2.5/family/uralic.yaml new file mode 100644 index 000000000..fec634712 --- /dev/null +++ b/scripts/train/2.5/family/uralic.yaml @@ -0,0 +1,53 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- fi_pud +- et_ewt +- hu_szeged +- fi_tdt +- fi_ftb +- olo_kkpp +- et_edt +- sme_giella +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt + fi: fi_tdt + fi_ftb: fi_ftb + fi_pud: fi_pud + fi_tdt: fi_tdt + finnish: fi_tdt + hu: hu_szeged + hu_szeged: hu_szeged + hungarian: hu_szeged + livvi: olo_kkpp + north sami: sme_giella + olo: olo_kkpp + olo_kkpp: olo_kkpp + sme: sme_giella + sme_giella: sme_giella +test_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-test.conllu + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_pud: corpus/ud-treebanks-v2.5/UD_Finnish-PUD/fi_pud-ud-train.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.5/family/viet-muong.yaml b/scripts/train/2.5/family/viet-muong.yaml new file mode 100644 index 000000000..3341261a2 --- /dev/null +++ b/scripts/train/2.5/family/viet-muong.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.5/language/af.yaml b/scripts/train/2.5/language/af.yaml new file mode 100644 index 000000000..24fe7f1e0 --- /dev/null +++ b/scripts/train/2.5/language/af.yaml @@ 
-0,0 +1,12 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu +language_codes: +- af_afribooms +language_map: + af: af_afribooms + af_afribooms: af_afribooms + afrikaans: af_afribooms +test_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu diff --git a/scripts/train/2.5/language/ar.yaml b/scripts/train/2.5/language/ar.yaml new file mode 100644 index 000000000..af90d9b96 --- /dev/null +++ b/scripts/train/2.5/language/ar.yaml @@ -0,0 +1,20 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu +language_codes: +- ar_pud +- ar_nyuad +- ar_padt +language_map: + ar: ar_padt + ar_nyuad: ar_nyuad + ar_padt: ar_padt + ar_pud: ar_pud + arabic: ar_padt +test_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu + ar_pud: corpus/ud-treebanks-v2.5/UD_Arabic-PUD/ar_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/be.yaml b/scripts/train/2.5/language/be.yaml new file mode 100644 index 000000000..e426224bb --- /dev/null +++ b/scripts/train/2.5/language/be.yaml @@ -0,0 +1,12 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu +language_codes: +- be_hse +language_map: + be: be_hse + be_hse: be_hse + belarusian: be_hse +test_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu diff --git a/scripts/train/2.5/language/bg.yaml b/scripts/train/2.5/language/bg.yaml new file mode 100644 index 000000000..3cd87a3b0 --- /dev/null +++ b/scripts/train/2.5/language/bg.yaml @@ -0,0 +1,12 @@ +dev_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu +language_codes: +- bg_btb +language_map: + bg: bg_btb + bg_btb: bg_btb + bulgarian: bg_btb +test_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-test.conllu +train_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu diff --git a/scripts/train/2.5/language/bxr.yaml b/scripts/train/2.5/language/bxr.yaml new file mode 100644 index 000000000..1e68dc47d --- /dev/null +++ b/scripts/train/2.5/language/bxr.yaml @@ -0,0 +1,12 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + buryat: bxr_bdt + bxr: bxr_bdt + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.5/language/ca.yaml b/scripts/train/2.5/language/ca.yaml new file mode 100644 index 000000000..63ca25d73 --- /dev/null +++ b/scripts/train/2.5/language/ca.yaml @@ -0,0 +1,12 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu +language_codes: +- ca_ancora +language_map: + ca: ca_ancora + ca_ancora: ca_ancora + catalan: ca_ancora +test_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-test.conllu 
+train_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu diff --git a/scripts/train/2.5/language/cop.yaml b/scripts/train/2.5/language/cop.yaml new file mode 100644 index 000000000..770e9eaa0 --- /dev/null +++ b/scripts/train/2.5/language/cop.yaml @@ -0,0 +1,12 @@ +dev_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop: cop_scriptorium + cop_scriptorium: cop_scriptorium + coptic: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.5/language/cs.yaml b/scripts/train/2.5/language/cs.yaml new file mode 100644 index 000000000..70a290f41 --- /dev/null +++ b/scripts/train/2.5/language/cs.yaml @@ -0,0 +1,30 @@ +dev_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu +language_codes: +- cs_cltt +- cs_cac +- cs_fictree +- cs_pud +- cs_pdt +language_map: + cs: cs_pdt + cs_cac: cs_cac + cs_cltt: cs_cltt + cs_fictree: cs_fictree + cs_pdt: cs_pdt + cs_pud: cs_pud + czech: cs_pdt +test_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-test.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-test.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-test.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-test.conllu +train_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu + cs_pud: corpus/ud-treebanks-v2.5/UD_Czech-PUD/cs_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/cu.yaml b/scripts/train/2.5/language/cu.yaml new file mode 100644 index 000000000..e286380c7 --- /dev/null +++ b/scripts/train/2.5/language/cu.yaml @@ -0,0 +1,12 @@ +dev_files: + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu +language_codes: +- cu_proiel +language_map: + cu: cu_proiel + cu_proiel: cu_proiel + old church slavonic: cu_proiel +test_files: + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu +train_files: + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu diff --git a/scripts/train/2.5/language/da.yaml b/scripts/train/2.5/language/da.yaml new file mode 100644 index 000000000..92a2d7052 --- /dev/null +++ b/scripts/train/2.5/language/da.yaml @@ -0,0 +1,12 @@ +dev_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu +language_codes: +- da_ddt +language_map: + da: da_ddt + da_ddt: da_ddt + danish: da_ddt +test_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-test.conllu +train_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu diff --git a/scripts/train/2.5/language/de.yaml b/scripts/train/2.5/language/de.yaml new file mode 100644 index 000000000..d8318b1ba --- /dev/null +++ b/scripts/train/2.5/language/de.yaml 
@@ -0,0 +1,20 @@ +dev_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu +language_codes: +- de_hdt +- de_pud +- de_gsd +language_map: + de: de_hdt + de_gsd: de_gsd + de_hdt: de_hdt + de_pud: de_pud + german: de_hdt +test_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-test.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-test.conllu +train_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu + de_pud: corpus/ud-treebanks-v2.5/UD_German-PUD/de_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/el.yaml b/scripts/train/2.5/language/el.yaml new file mode 100644 index 000000000..9e815a8fe --- /dev/null +++ b/scripts/train/2.5/language/el.yaml @@ -0,0 +1,12 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu +language_codes: +- el_gdt +language_map: + el: el_gdt + el_gdt: el_gdt + greek: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu diff --git a/scripts/train/2.5/language/en.yaml b/scripts/train/2.5/language/en.yaml new file mode 100644 index 000000000..d91c0e00b --- /dev/null +++ b/scripts/train/2.5/language/en.yaml @@ -0,0 +1,35 @@ +dev_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-dev.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu +language_codes: +- en_esl +- en_pud +- en_gum +- en_ewt +- en_lines +- en_partut +language_map: + en: en_ewt + en_esl: en_esl + en_ewt: en_ewt + en_gum: en_gum + en_lines: en_lines + en_partut: en_partut + en_pud: en_pud + english: en_ewt +test_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-test.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-test.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-test.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-test.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-test.conllu +train_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-train.conllu + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu + en_pud: corpus/ud-treebanks-v2.5/UD_English-PUD/en_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/es.yaml b/scripts/train/2.5/language/es.yaml new file mode 100644 index 000000000..59b58acda --- /dev/null +++ b/scripts/train/2.5/language/es.yaml @@ -0,0 +1,20 @@ +dev_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu +language_codes: +- es_ancora +- es_gsd +- es_pud +language_map: + es: es_gsd + es_ancora: es_ancora + es_gsd: es_gsd + es_pud: es_pud + spanish: es_gsd +test_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-test.conllu + es_gsd: 
corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-test.conllu +train_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu + es_pud: corpus/ud-treebanks-v2.5/UD_Spanish-PUD/es_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/et.yaml b/scripts/train/2.5/language/et.yaml new file mode 100644 index 000000000..2769e6758 --- /dev/null +++ b/scripts/train/2.5/language/et.yaml @@ -0,0 +1,17 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu +language_codes: +- et_edt +- et_ewt +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt +test_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu diff --git a/scripts/train/2.5/language/eu.yaml b/scripts/train/2.5/language/eu.yaml new file mode 100644 index 000000000..a6b12983c --- /dev/null +++ b/scripts/train/2.5/language/eu.yaml @@ -0,0 +1,12 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + basque: eu_bdt + eu: eu_bdt + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.5/language/fa.yaml b/scripts/train/2.5/language/fa.yaml new file mode 100644 index 000000000..06b0fb15c --- /dev/null +++ b/scripts/train/2.5/language/fa.yaml @@ -0,0 +1,12 @@ +dev_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu +language_codes: +- fa_seraji +language_map: + fa: fa_seraji + fa_seraji: fa_seraji + persian: fa_seraji +test_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-test.conllu +train_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu diff --git a/scripts/train/2.5/language/fi.yaml b/scripts/train/2.5/language/fi.yaml new file mode 100644 index 000000000..1a002e82c --- /dev/null +++ b/scripts/train/2.5/language/fi.yaml @@ -0,0 +1,20 @@ +dev_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu +language_codes: +- fi_tdt +- fi_ftb +- fi_pud +language_map: + fi: fi_tdt + fi_ftb: fi_ftb + fi_pud: fi_pud + fi_tdt: fi_tdt + finnish: fi_tdt +test_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-test.conllu +train_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_pud: corpus/ud-treebanks-v2.5/UD_Finnish-PUD/fi_pud-ud-train.conllu + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu diff --git a/scripts/train/2.5/language/fr.yaml b/scripts/train/2.5/language/fr.yaml new file mode 100644 index 000000000..08d050f41 --- /dev/null +++ b/scripts/train/2.5/language/fr.yaml @@ -0,0 +1,35 @@ +dev_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu + fr_partut: 
corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu +language_codes: +- fr_sequoia +- fr_partut +- fr_spoken +- fr_ftb +- fr_gsd +- fr_pud +language_map: + fr: fr_gsd + fr_ftb: fr_ftb + fr_gsd: fr_gsd + fr_partut: fr_partut + fr_pud: fr_pud + fr_sequoia: fr_sequoia + fr_spoken: fr_spoken + french: fr_gsd +test_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-test.conllu + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-test.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-test.conllu +train_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu + fr_pud: corpus/ud-treebanks-v2.5/UD_French-PUD/fr_pud-ud-train.conllu + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu diff --git a/scripts/train/2.5/language/fro.yaml b/scripts/train/2.5/language/fro.yaml new file mode 100644 index 000000000..7c93886c3 --- /dev/null +++ b/scripts/train/2.5/language/fro.yaml @@ -0,0 +1,12 @@ +dev_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu +language_codes: +- fro_srcmf +language_map: + fro: fro_srcmf + fro_srcmf: fro_srcmf + old french: fro_srcmf +test_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu +train_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu diff --git a/scripts/train/2.5/language/ga.yaml b/scripts/train/2.5/language/ga.yaml new file mode 100644 index 000000000..ec0c12024 --- /dev/null +++ b/scripts/train/2.5/language/ga.yaml @@ -0,0 +1,12 @@ +dev_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu +language_codes: +- ga_idt +language_map: + ga: ga_idt + ga_idt: ga_idt + irish: ga_idt +test_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-test.conllu +train_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu diff --git a/scripts/train/2.5/language/gd.yaml b/scripts/train/2.5/language/gd.yaml new file mode 100644 index 000000000..7fbab92ed --- /dev/null +++ b/scripts/train/2.5/language/gd.yaml @@ -0,0 +1,12 @@ +dev_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +language_map: + gd: gd_arcosg + gd_arcosg: gd_arcosg + scottish gaelic: gd_arcosg +test_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.5/language/gl.yaml b/scripts/train/2.5/language/gl.yaml new file mode 100644 index 000000000..b05c40064 --- /dev/null +++ b/scripts/train/2.5/language/gl.yaml @@ -0,0 +1,17 @@ +dev_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +language_codes: +- gl_ctg +- gl_treegal 
+language_map: + galician: gl_ctg + gl: gl_ctg + gl_ctg: gl_ctg + gl_treegal: gl_treegal +test_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-test.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +train_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu diff --git a/scripts/train/2.5/language/got.yaml b/scripts/train/2.5/language/got.yaml new file mode 100644 index 000000000..98e0adf32 --- /dev/null +++ b/scripts/train/2.5/language/got.yaml @@ -0,0 +1,12 @@ +dev_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu +language_codes: +- got_proiel +language_map: + got: got_proiel + got_proiel: got_proiel + gothic: got_proiel +test_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-test.conllu +train_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu diff --git a/scripts/train/2.5/language/grc.yaml b/scripts/train/2.5/language/grc.yaml new file mode 100644 index 000000000..9316ff178 --- /dev/null +++ b/scripts/train/2.5/language/grc.yaml @@ -0,0 +1,17 @@ +dev_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_perseus +- grc_proiel +language_map: + ancient greek: grc_proiel + grc: grc_proiel + grc_perseus: grc_perseus + grc_proiel: grc_proiel +test_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git a/scripts/train/2.5/language/he.yaml b/scripts/train/2.5/language/he.yaml new file mode 100644 index 000000000..06b3a9c62 --- /dev/null +++ b/scripts/train/2.5/language/he.yaml @@ -0,0 +1,12 @@ +dev_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu +language_codes: +- he_htb +language_map: + he: he_htb + he_htb: he_htb + hebrew: he_htb +test_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-test.conllu +train_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu diff --git a/scripts/train/2.5/language/hi.yaml b/scripts/train/2.5/language/hi.yaml new file mode 100644 index 000000000..3ad011fc7 --- /dev/null +++ b/scripts/train/2.5/language/hi.yaml @@ -0,0 +1,15 @@ +dev_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu +language_codes: +- hi_pud +- hi_hdtb +language_map: + hi: hi_hdtb + hi_hdtb: hi_hdtb + hi_pud: hi_pud + hindi: hi_hdtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu +train_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu + hi_pud: corpus/ud-treebanks-v2.5/UD_Hindi-PUD/hi_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/hr.yaml b/scripts/train/2.5/language/hr.yaml new file mode 100644 index 000000000..877775409 --- /dev/null +++ b/scripts/train/2.5/language/hr.yaml @@ -0,0 +1,12 @@ +dev_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu +language_codes: +- hr_set +language_map: + croatian: hr_set + hr: hr_set + 
hr_set: hr_set +test_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-test.conllu +train_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu diff --git a/scripts/train/2.5/language/hsb.yaml b/scripts/train/2.5/language/hsb.yaml new file mode 100644 index 000000000..f50783e33 --- /dev/null +++ b/scripts/train/2.5/language/hsb.yaml @@ -0,0 +1,12 @@ +dev_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +language_codes: +- hsb_ufal +language_map: + hsb: hsb_ufal + hsb_ufal: hsb_ufal + upper sorbian: hsb_ufal +test_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +train_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu diff --git a/scripts/train/2.5/language/hu.yaml b/scripts/train/2.5/language/hu.yaml new file mode 100644 index 000000000..0d698da78 --- /dev/null +++ b/scripts/train/2.5/language/hu.yaml @@ -0,0 +1,12 @@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu: hu_szeged + hu_szeged: hu_szeged + hungarian: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.5/language/hy.yaml b/scripts/train/2.5/language/hy.yaml new file mode 100644 index 000000000..6813be43b --- /dev/null +++ b/scripts/train/2.5/language/hy.yaml @@ -0,0 +1,12 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + armenian: hy_armtdp + hy: hy_armtdp + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.5/language/id.yaml b/scripts/train/2.5/language/id.yaml new file mode 100644 index 000000000..bbf08cdbf --- /dev/null +++ b/scripts/train/2.5/language/id.yaml @@ -0,0 +1,15 @@ +dev_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_pud +- id_gsd +language_map: + id: id_gsd + id_gsd: id_gsd + id_pud: id_pud + indonesian: id_gsd +test_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu + id_pud: corpus/ud-treebanks-v2.5/UD_Indonesian-PUD/id_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/it.yaml b/scripts/train/2.5/language/it.yaml new file mode 100644 index 000000000..6c3cad744 --- /dev/null +++ b/scripts/train/2.5/language/it.yaml @@ -0,0 +1,35 @@ +dev_files: + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu +language_codes: +- it_twittiro +- it_isdt +- it_pud +- it_postwita +- it_partut +- it_vit +language_map: + it: it_isdt + it_isdt: it_isdt + it_partut: it_partut + it_postwita: it_postwita + it_pud: it_pud + it_twittiro: it_twittiro + it_vit: it_vit + italian: it_isdt +test_files: + 
it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-test.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-test.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-test.conllu +train_files: + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu + it_pud: corpus/ud-treebanks-v2.5/UD_Italian-PUD/it_pud-ud-train.conllu + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu diff --git a/scripts/train/2.5/language/ja.yaml b/scripts/train/2.5/language/ja.yaml new file mode 100644 index 000000000..15d1579d4 --- /dev/null +++ b/scripts/train/2.5/language/ja.yaml @@ -0,0 +1,20 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_gsd +- ja_bccwj +- ja_pud +language_map: + ja: ja_gsd + ja_bccwj: ja_bccwj + ja_gsd: ja_gsd + ja_pud: ja_pud + japanese: ja_gsd +test_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu + ja_pud: corpus/ud-treebanks-v2.5/UD_Japanese-PUD/ja_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/kk.yaml b/scripts/train/2.5/language/kk.yaml new file mode 100644 index 000000000..9ac4c36b0 --- /dev/null +++ b/scripts/train/2.5/language/kk.yaml @@ -0,0 +1,12 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb +test_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.5/language/kmr.yaml b/scripts/train/2.5/language/kmr.yaml new file mode 100644 index 000000000..9e14854a5 --- /dev/null +++ b/scripts/train/2.5/language/kmr.yaml @@ -0,0 +1,12 @@ +dev_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- kmr_mg +language_map: + kmr: kmr_mg + kmr_mg: kmr_mg + kurmanji: kmr_mg +test_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.5/language/ko.yaml b/scripts/train/2.5/language/ko.yaml new file mode 100644 index 000000000..b8194f554 --- /dev/null +++ b/scripts/train/2.5/language/ko.yaml @@ -0,0 +1,20 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_pud +- ko_kaist +- ko_gsd +language_map: + ko: ko_gsd + ko_gsd: ko_gsd + ko_kaist: ko_kaist + ko_pud: ko_pud + korean: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-test.conllu + ko_kaist: 
corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu + ko_pud: corpus/ud-treebanks-v2.5/UD_Korean-PUD/ko_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/la.yaml b/scripts/train/2.5/language/la.yaml new file mode 100644 index 000000000..4b812d620 --- /dev/null +++ b/scripts/train/2.5/language/la.yaml @@ -0,0 +1,22 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_ittb +- la_perseus +- la_proiel +language_map: + la: la_ittb + la_ittb: la_ittb + la_perseus: la_perseus + la_proiel: la_proiel + latin: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-test.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.5/language/lt.yaml b/scripts/train/2.5/language/lt.yaml new file mode 100644 index 000000000..40c332b05 --- /dev/null +++ b/scripts/train/2.5/language/lt.yaml @@ -0,0 +1,17 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu +language_codes: +- lt_alksnis +- lt_hse +language_map: + lithuanian: lt_alksnis + lt: lt_alksnis + lt_alksnis: lt_alksnis + lt_hse: lt_hse +test_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu diff --git a/scripts/train/2.5/language/lv.yaml b/scripts/train/2.5/language/lv.yaml new file mode 100644 index 000000000..ce97b4191 --- /dev/null +++ b/scripts/train/2.5/language/lv.yaml @@ -0,0 +1,12 @@ +dev_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lv_lvtb +language_map: + latvian: lv_lvtb + lv: lv_lvtb + lv_lvtb: lv_lvtb +test_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.5/language/lzh.yaml b/scripts/train/2.5/language/lzh.yaml new file mode 100644 index 000000000..bcc0791b3 --- /dev/null +++ b/scripts/train/2.5/language/lzh.yaml @@ -0,0 +1,12 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu +language_codes: +- lzh_kyoto +language_map: + classical chinese: lzh_kyoto + lzh: lzh_kyoto + lzh_kyoto: lzh_kyoto +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu diff --git a/scripts/train/2.5/language/mr.yaml 
b/scripts/train/2.5/language/mr.yaml new file mode 100644 index 000000000..2a719e041 --- /dev/null +++ b/scripts/train/2.5/language/mr.yaml @@ -0,0 +1,12 @@ +dev_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu +language_codes: +- mr_ufal +language_map: + marathi: mr_ufal + mr: mr_ufal + mr_ufal: mr_ufal +test_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-test.conllu +train_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu diff --git a/scripts/train/2.5/language/mt.yaml b/scripts/train/2.5/language/mt.yaml new file mode 100644 index 000000000..971df62a4 --- /dev/null +++ b/scripts/train/2.5/language/mt.yaml @@ -0,0 +1,12 @@ +dev_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- mt_mudt +language_map: + maltese: mt_mudt + mt: mt_mudt + mt_mudt: mt_mudt +test_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.5/language/nl.yaml b/scripts/train/2.5/language/nl.yaml new file mode 100644 index 000000000..f48243465 --- /dev/null +++ b/scripts/train/2.5/language/nl.yaml @@ -0,0 +1,17 @@ +dev_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu +language_codes: +- nl_alpino +- nl_lassysmall +language_map: + dutch: nl_alpino + nl: nl_alpino + nl_alpino: nl_alpino + nl_lassysmall: nl_lassysmall +test_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-test.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu +train_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu diff --git a/scripts/train/2.5/language/no.yaml b/scripts/train/2.5/language/no.yaml new file mode 100644 index 000000000..3fc7c6eb0 --- /dev/null +++ b/scripts/train/2.5/language/no.yaml @@ -0,0 +1,22 @@ +dev_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu +language_codes: +- no_nynorsk +- no_nynorsklia +- no_bokmaal +language_map: + 'no': no_bokmaal + no_bokmaal: no_bokmaal + no_nynorsk: no_nynorsk + no_nynorsklia: no_nynorsklia + norwegian: no_bokmaal +test_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu +train_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu diff --git a/scripts/train/2.5/language/olo.yaml b/scripts/train/2.5/language/olo.yaml new file mode 100644 index 000000000..7da62d25b --- /dev/null +++ b/scripts/train/2.5/language/olo.yaml @@ -0,0 +1,12 @@ +dev_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- olo_kkpp 
+language_map: + livvi: olo_kkpp + olo: olo_kkpp + olo_kkpp: olo_kkpp +test_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.5/language/pl.yaml b/scripts/train/2.5/language/pl.yaml new file mode 100644 index 000000000..32924c0eb --- /dev/null +++ b/scripts/train/2.5/language/pl.yaml @@ -0,0 +1,20 @@ +dev_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu +language_codes: +- pl_pud +- pl_lfg +- pl_pdb +language_map: + pl: pl_pdb + pl_lfg: pl_lfg + pl_pdb: pl_pdb + pl_pud: pl_pud + polish: pl_pdb +test_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-test.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-test.conllu +train_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu + pl_pud: corpus/ud-treebanks-v2.5/UD_Polish-PUD/pl_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/pt.yaml b/scripts/train/2.5/language/pt.yaml new file mode 100644 index 000000000..11e5103e4 --- /dev/null +++ b/scripts/train/2.5/language/pt.yaml @@ -0,0 +1,20 @@ +dev_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu +language_codes: +- pt_pud +- pt_gsd +- pt_bosque +language_map: + portuguese: pt_bosque + pt: pt_bosque + pt_bosque: pt_bosque + pt_gsd: pt_gsd + pt_pud: pt_pud +test_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-test.conllu +train_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu + pt_pud: corpus/ud-treebanks-v2.5/UD_Portuguese-PUD/pt_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/ro.yaml b/scripts/train/2.5/language/ro.yaml new file mode 100644 index 000000000..c227cdcbe --- /dev/null +++ b/scripts/train/2.5/language/ro.yaml @@ -0,0 +1,17 @@ +dev_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- ro_nonstandard +- ro_rrt +language_map: + ro: ro_rrt + ro_nonstandard: ro_nonstandard + ro_rrt: ro_rrt + romanian: ro_rrt +test_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.5/language/ru.yaml b/scripts/train/2.5/language/ru.yaml new file mode 100644 index 000000000..d1c41bbf2 --- /dev/null +++ b/scripts/train/2.5/language/ru.yaml @@ -0,0 +1,25 @@ +dev_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu +language_codes: +- ru_gsd +- ru_pud +- ru_taiga +- ru_syntagrus +language_map: + ru: ru_syntagrus + ru_gsd: 
ru_gsd + ru_pud: ru_pud + ru_syntagrus: ru_syntagrus + ru_taiga: ru_taiga + russian: ru_syntagrus +test_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-test.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-test.conllu +train_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu + ru_pud: corpus/ud-treebanks-v2.5/UD_Russian-PUD/ru_pud-ud-train.conllu + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu diff --git a/scripts/train/2.5/language/sk.yaml b/scripts/train/2.5/language/sk.yaml new file mode 100644 index 000000000..4f267614e --- /dev/null +++ b/scripts/train/2.5/language/sk.yaml @@ -0,0 +1,12 @@ +dev_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu +language_codes: +- sk_snk +language_map: + sk: sk_snk + sk_snk: sk_snk + slovak: sk_snk +test_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-test.conllu +train_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu diff --git a/scripts/train/2.5/language/sl.yaml b/scripts/train/2.5/language/sl.yaml new file mode 100644 index 000000000..224845ca7 --- /dev/null +++ b/scripts/train/2.5/language/sl.yaml @@ -0,0 +1,17 @@ +dev_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu +language_codes: +- sl_ssj +- sl_sst +language_map: + sl: sl_ssj + sl_ssj: sl_ssj + sl_sst: sl_sst + slovenian: sl_ssj +test_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu +train_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu diff --git a/scripts/train/2.5/language/sme.yaml b/scripts/train/2.5/language/sme.yaml new file mode 100644 index 000000000..448972339 --- /dev/null +++ b/scripts/train/2.5/language/sme.yaml @@ -0,0 +1,12 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + north sami: sme_giella + sme: sme_giella + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.5/language/sr.yaml b/scripts/train/2.5/language/sr.yaml new file mode 100644 index 000000000..0c34b18c4 --- /dev/null +++ b/scripts/train/2.5/language/sr.yaml @@ -0,0 +1,12 @@ +dev_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu +language_codes: +- sr_set +language_map: + serbian: sr_set + sr: sr_set + sr_set: sr_set +test_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-test.conllu +train_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu diff --git a/scripts/train/2.5/language/sv.yaml b/scripts/train/2.5/language/sv.yaml new file mode 100644 index 000000000..cccb3ca62 --- /dev/null +++ b/scripts/train/2.5/language/sv.yaml @@ -0,0 +1,20 @@ +dev_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu + sv_talbanken: 
corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- sv_talbanken +- sv_lines +- sv_pud +language_map: + sv: sv_talbanken + sv_lines: sv_lines + sv_pud: sv_pud + sv_talbanken: sv_talbanken + swedish: sv_talbanken +test_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-test.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu + sv_pud: corpus/ud-treebanks-v2.5/UD_Swedish-PUD/sv_pud-ud-train.conllu + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu diff --git a/scripts/train/2.5/language/swl.yaml b/scripts/train/2.5/language/swl.yaml new file mode 100644 index 000000000..f1f89d077 --- /dev/null +++ b/scripts/train/2.5/language/swl.yaml @@ -0,0 +1,12 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swedish sign language: swl_sslc + swl: swl_sslc + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.5/language/ta.yaml b/scripts/train/2.5/language/ta.yaml new file mode 100644 index 000000000..15b97eeb0 --- /dev/null +++ b/scripts/train/2.5/language/ta.yaml @@ -0,0 +1,12 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.5/language/te.yaml b/scripts/train/2.5/language/te.yaml new file mode 100644 index 000000000..0bf9087a4 --- /dev/null +++ b/scripts/train/2.5/language/te.yaml @@ -0,0 +1,12 @@ +dev_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.5/language/tr.yaml b/scripts/train/2.5/language/tr.yaml new file mode 100644 index 000000000..6a7ca6e4a --- /dev/null +++ b/scripts/train/2.5/language/tr.yaml @@ -0,0 +1,15 @@ +dev_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_pud +- tr_imst +language_map: + tr: tr_imst + tr_imst: tr_imst + tr_pud: tr_pud + turkish: tr_imst +test_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu + tr_pud: corpus/ud-treebanks-v2.5/UD_Turkish-PUD/tr_pud-ud-train.conllu diff --git a/scripts/train/2.5/language/ug.yaml b/scripts/train/2.5/language/ug.yaml new file mode 100644 index 000000000..0f88152bd --- /dev/null +++ b/scripts/train/2.5/language/ug.yaml @@ -0,0 +1,12 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + 
ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.5/language/uk.yaml b/scripts/train/2.5/language/uk.yaml new file mode 100644 index 000000000..03cfa0803 --- /dev/null +++ b/scripts/train/2.5/language/uk.yaml @@ -0,0 +1,12 @@ +dev_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- uk_iu +language_map: + uk: uk_iu + uk_iu: uk_iu + ukrainian: uk_iu +test_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.5/language/ur.yaml b/scripts/train/2.5/language/ur.yaml new file mode 100644 index 000000000..ccbf03d55 --- /dev/null +++ b/scripts/train/2.5/language/ur.yaml @@ -0,0 +1,12 @@ +dev_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- ur_udtb +language_map: + ur: ur_udtb + ur_udtb: ur_udtb + urdu: ur_udtb +test_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-test.conllu +train_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.5/language/vi.yaml b/scripts/train/2.5/language/vi.yaml new file mode 100644 index 000000000..3341261a2 --- /dev/null +++ b/scripts/train/2.5/language/vi.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.5/language/wo.yaml b/scripts/train/2.5/language/wo.yaml new file mode 100644 index 000000000..17ca88a87 --- /dev/null +++ b/scripts/train/2.5/language/wo.yaml @@ -0,0 +1,12 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.5/language/zh.yaml b/scripts/train/2.5/language/zh.yaml new file mode 100644 index 000000000..5217418aa --- /dev/null +++ b/scripts/train/2.5/language/zh.yaml @@ -0,0 +1,20 @@ +dev_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu +language_codes: +- zh_gsdsimp +- zh_pud +- zh_gsd +language_map: + chinese: zh_gsdsimp + zh: zh_gsdsimp + zh_gsd: zh_gsd + zh_gsdsimp: zh_gsdsimp + zh_pud: zh_pud +test_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-test.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu + zh_pud: corpus/ud-treebanks-v2.5/UD_Chinese-PUD/zh_pud-ud-train.conllu diff --git a/scripts/train/2.5/treebank/af_afribooms.yaml b/scripts/train/2.5/treebank/af_afribooms.yaml new file mode 100644 index 000000000..c3317d342 --- /dev/null +++ b/scripts/train/2.5/treebank/af_afribooms.yaml @@ -0,0 +1,10 @@ +dev_files: + af_afribooms: 
corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu +language_codes: +- af_afribooms +language_map: + af_afribooms: af_afribooms +test_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.5/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ar_nyuad.yaml b/scripts/train/2.5/treebank/ar_nyuad.yaml new file mode 100644 index 000000000..03f73bd9e --- /dev/null +++ b/scripts/train/2.5/treebank/ar_nyuad.yaml @@ -0,0 +1,10 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu +language_codes: +- ar_nyuad +language_map: + ar_nyuad: ar_nyuad +test_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.5/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ar_padt.yaml b/scripts/train/2.5/treebank/ar_padt.yaml new file mode 100644 index 000000000..4bcb553d2 --- /dev/null +++ b/scripts/train/2.5/treebank/ar_padt.yaml @@ -0,0 +1,10 @@ +dev_files: + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-dev.conllu +language_codes: +- ar_padt +language_map: + ar_padt: ar_padt +test_files: + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-test.conllu +train_files: + ar_padt: corpus/ud-treebanks-v2.5/UD_Arabic-PADT/ar_padt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/be_hse.yaml b/scripts/train/2.5/treebank/be_hse.yaml new file mode 100644 index 000000000..c9fafb306 --- /dev/null +++ b/scripts/train/2.5/treebank/be_hse.yaml @@ -0,0 +1,10 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-dev.conllu +language_codes: +- be_hse +language_map: + be_hse: be_hse +test_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.5/UD_Belarusian-HSE/be_hse-ud-train.conllu diff --git a/scripts/train/2.5/treebank/bg_btb.yaml b/scripts/train/2.5/treebank/bg_btb.yaml new file mode 100644 index 000000000..a7340ec17 --- /dev/null +++ b/scripts/train/2.5/treebank/bg_btb.yaml @@ -0,0 +1,10 @@ +dev_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu +language_codes: +- bg_btb +language_map: + bg_btb: bg_btb +test_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-test.conllu +train_files: + bg_btb: corpus/ud-treebanks-v2.5/UD_Bulgarian-BTB/bg_btb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/bxr_bdt.yaml b/scripts/train/2.5/treebank/bxr_bdt.yaml new file mode 100644 index 000000000..3e736dded --- /dev/null +++ b/scripts/train/2.5/treebank/bxr_bdt.yaml @@ -0,0 +1,10 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.5/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ca_ancora.yaml b/scripts/train/2.5/treebank/ca_ancora.yaml new file mode 100644 index 000000000..306d91878 --- /dev/null +++ b/scripts/train/2.5/treebank/ca_ancora.yaml @@ -0,0 +1,10 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu +language_codes: +- ca_ancora +language_map: + ca_ancora: ca_ancora +test_files: + ca_ancora: 
corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-test.conllu +train_files: + ca_ancora: corpus/ud-treebanks-v2.5/UD_Catalan-AnCora/ca_ancora-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cop_scriptorium.yaml b/scripts/train/2.5/treebank/cop_scriptorium.yaml new file mode 100644 index 000000000..e76d0127e --- /dev/null +++ b/scripts/train/2.5/treebank/cop_scriptorium.yaml @@ -0,0 +1,10 @@ +dev_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop_scriptorium: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.5/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cs_cac.yaml b/scripts/train/2.5/treebank/cs_cac.yaml new file mode 100644 index 000000000..928587503 --- /dev/null +++ b/scripts/train/2.5/treebank/cs_cac.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-dev.conllu +language_codes: +- cs_cac +language_map: + cs_cac: cs_cac +test_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-test.conllu +train_files: + cs_cac: corpus/ud-treebanks-v2.5/UD_Czech-CAC/cs_cac-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cs_cltt.yaml b/scripts/train/2.5/treebank/cs_cltt.yaml new file mode 100644 index 000000000..64eb85c09 --- /dev/null +++ b/scripts/train/2.5/treebank/cs_cltt.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-dev.conllu +language_codes: +- cs_cltt +language_map: + cs_cltt: cs_cltt +test_files: + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-test.conllu +train_files: + cs_cltt: corpus/ud-treebanks-v2.5/UD_Czech-CLTT/cs_cltt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cs_fictree.yaml b/scripts/train/2.5/treebank/cs_fictree.yaml new file mode 100644 index 000000000..606a91c6a --- /dev/null +++ b/scripts/train/2.5/treebank/cs_fictree.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-dev.conllu +language_codes: +- cs_fictree +language_map: + cs_fictree: cs_fictree +test_files: + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-test.conllu +train_files: + cs_fictree: corpus/ud-treebanks-v2.5/UD_Czech-FicTree/cs_fictree-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cs_pdt.yaml b/scripts/train/2.5/treebank/cs_pdt.yaml new file mode 100644 index 000000000..dbfd2f4e7 --- /dev/null +++ b/scripts/train/2.5/treebank/cs_pdt.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-dev.conllu +language_codes: +- cs_pdt +language_map: + cs_pdt: cs_pdt +test_files: + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-test.conllu +train_files: + cs_pdt: corpus/ud-treebanks-v2.5/UD_Czech-PDT/cs_pdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/cu_proiel.yaml b/scripts/train/2.5/treebank/cu_proiel.yaml new file mode 100644 index 000000000..78a49f468 --- /dev/null +++ b/scripts/train/2.5/treebank/cu_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu +language_codes: +- cu_proiel +language_map: + cu_proiel: cu_proiel +test_files: + cu_proiel: corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu +train_files: + cu_proiel: 
corpus/ud-treebanks-v2.5/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu diff --git a/scripts/train/2.5/treebank/da_ddt.yaml b/scripts/train/2.5/treebank/da_ddt.yaml new file mode 100644 index 000000000..91dd06b2d --- /dev/null +++ b/scripts/train/2.5/treebank/da_ddt.yaml @@ -0,0 +1,10 @@ +dev_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-dev.conllu +language_codes: +- da_ddt +language_map: + da_ddt: da_ddt +test_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-test.conllu +train_files: + da_ddt: corpus/ud-treebanks-v2.5/UD_Danish-DDT/da_ddt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/de_gsd.yaml b/scripts/train/2.5/treebank/de_gsd.yaml new file mode 100644 index 000000000..9380475df --- /dev/null +++ b/scripts/train/2.5/treebank/de_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-dev.conllu +language_codes: +- de_gsd +language_map: + de_gsd: de_gsd +test_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-test.conllu +train_files: + de_gsd: corpus/ud-treebanks-v2.5/UD_German-GSD/de_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/de_hdt.yaml b/scripts/train/2.5/treebank/de_hdt.yaml new file mode 100644 index 000000000..79a24676a --- /dev/null +++ b/scripts/train/2.5/treebank/de_hdt.yaml @@ -0,0 +1,10 @@ +dev_files: + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-dev.conllu +language_codes: +- de_hdt +language_map: + de_hdt: de_hdt +test_files: + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-test.conllu +train_files: + de_hdt: corpus/ud-treebanks-v2.5/UD_German-HDT/de_hdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/el_gdt.yaml b/scripts/train/2.5/treebank/el_gdt.yaml new file mode 100644 index 000000000..ce7951e74 --- /dev/null +++ b/scripts/train/2.5/treebank/el_gdt.yaml @@ -0,0 +1,10 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-dev.conllu +language_codes: +- el_gdt +language_map: + el_gdt: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.5/UD_Greek-GDT/el_gdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/en_esl.yaml b/scripts/train/2.5/treebank/en_esl.yaml new file mode 100644 index 000000000..54c9a3c27 --- /dev/null +++ b/scripts/train/2.5/treebank/en_esl.yaml @@ -0,0 +1,10 @@ +dev_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-dev.conllu +language_codes: +- en_esl +language_map: + en_esl: en_esl +test_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-test.conllu +train_files: + en_esl: corpus/ud-treebanks-v2.5/UD_English-ESL/en_esl-ud-train.conllu diff --git a/scripts/train/2.5/treebank/en_ewt.yaml b/scripts/train/2.5/treebank/en_ewt.yaml new file mode 100644 index 000000000..f39e4b303 --- /dev/null +++ b/scripts/train/2.5/treebank/en_ewt.yaml @@ -0,0 +1,10 @@ +dev_files: + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-dev.conllu +language_codes: +- en_ewt +language_map: + en_ewt: en_ewt +test_files: + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-test.conllu +train_files: + en_ewt: corpus/ud-treebanks-v2.5/UD_English-EWT/en_ewt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/en_gum.yaml b/scripts/train/2.5/treebank/en_gum.yaml new file mode 100644 index 000000000..a716e2c48 --- /dev/null +++ b/scripts/train/2.5/treebank/en_gum.yaml @@ -0,0 +1,10 @@ +dev_files: + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-dev.conllu 
+language_codes: +- en_gum +language_map: + en_gum: en_gum +test_files: + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-test.conllu +train_files: + en_gum: corpus/ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu diff --git a/scripts/train/2.5/treebank/en_lines.yaml b/scripts/train/2.5/treebank/en_lines.yaml new file mode 100644 index 000000000..92f8c63bc --- /dev/null +++ b/scripts/train/2.5/treebank/en_lines.yaml @@ -0,0 +1,10 @@ +dev_files: + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-dev.conllu +language_codes: +- en_lines +language_map: + en_lines: en_lines +test_files: + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-test.conllu +train_files: + en_lines: corpus/ud-treebanks-v2.5/UD_English-LinES/en_lines-ud-train.conllu diff --git a/scripts/train/2.5/treebank/en_partut.yaml b/scripts/train/2.5/treebank/en_partut.yaml new file mode 100644 index 000000000..8367744c1 --- /dev/null +++ b/scripts/train/2.5/treebank/en_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-dev.conllu +language_codes: +- en_partut +language_map: + en_partut: en_partut +test_files: + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-test.conllu +train_files: + en_partut: corpus/ud-treebanks-v2.5/UD_English-ParTUT/en_partut-ud-train.conllu diff --git a/scripts/train/2.5/treebank/es_ancora.yaml b/scripts/train/2.5/treebank/es_ancora.yaml new file mode 100644 index 000000000..4ab2db95e --- /dev/null +++ b/scripts/train/2.5/treebank/es_ancora.yaml @@ -0,0 +1,10 @@ +dev_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-dev.conllu +language_codes: +- es_ancora +language_map: + es_ancora: es_ancora +test_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-test.conllu +train_files: + es_ancora: corpus/ud-treebanks-v2.5/UD_Spanish-AnCora/es_ancora-ud-train.conllu diff --git a/scripts/train/2.5/treebank/es_gsd.yaml b/scripts/train/2.5/treebank/es_gsd.yaml new file mode 100644 index 000000000..aac626d5f --- /dev/null +++ b/scripts/train/2.5/treebank/es_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-dev.conllu +language_codes: +- es_gsd +language_map: + es_gsd: es_gsd +test_files: + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-test.conllu +train_files: + es_gsd: corpus/ud-treebanks-v2.5/UD_Spanish-GSD/es_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/et_edt.yaml b/scripts/train/2.5/treebank/et_edt.yaml new file mode 100644 index 000000000..edafd192f --- /dev/null +++ b/scripts/train/2.5/treebank/et_edt.yaml @@ -0,0 +1,10 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-dev.conllu +language_codes: +- et_edt +language_map: + et_edt: et_edt +test_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.5/UD_Estonian-EDT/et_edt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/et_ewt.yaml b/scripts/train/2.5/treebank/et_ewt.yaml new file mode 100644 index 000000000..a64ed77d4 --- /dev/null +++ b/scripts/train/2.5/treebank/et_ewt.yaml @@ -0,0 +1,10 @@ +dev_files: + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu +language_codes: +- et_ewt +language_map: + et_ewt: et_ewt +test_files: + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-test.conllu +train_files: + et_ewt: corpus/ud-treebanks-v2.5/UD_Estonian-EWT/et_ewt-ud-train.conllu diff --git 
a/scripts/train/2.5/treebank/eu_bdt.yaml b/scripts/train/2.5/treebank/eu_bdt.yaml new file mode 100644 index 000000000..75a7796c8 --- /dev/null +++ b/scripts/train/2.5/treebank/eu_bdt.yaml @@ -0,0 +1,10 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.5/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fa_seraji.yaml b/scripts/train/2.5/treebank/fa_seraji.yaml new file mode 100644 index 000000000..e454cc652 --- /dev/null +++ b/scripts/train/2.5/treebank/fa_seraji.yaml @@ -0,0 +1,10 @@ +dev_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-dev.conllu +language_codes: +- fa_seraji +language_map: + fa_seraji: fa_seraji +test_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-test.conllu +train_files: + fa_seraji: corpus/ud-treebanks-v2.5/UD_Persian-Seraji/fa_seraji-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fi_ftb.yaml b/scripts/train/2.5/treebank/fi_ftb.yaml new file mode 100644 index 000000000..d1fd10aaa --- /dev/null +++ b/scripts/train/2.5/treebank/fi_ftb.yaml @@ -0,0 +1,10 @@ +dev_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-dev.conllu +language_codes: +- fi_ftb +language_map: + fi_ftb: fi_ftb +test_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-test.conllu +train_files: + fi_ftb: corpus/ud-treebanks-v2.5/UD_Finnish-FTB/fi_ftb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fi_tdt.yaml b/scripts/train/2.5/treebank/fi_tdt.yaml new file mode 100644 index 000000000..59e694c92 --- /dev/null +++ b/scripts/train/2.5/treebank/fi_tdt.yaml @@ -0,0 +1,10 @@ +dev_files: + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-dev.conllu +language_codes: +- fi_tdt +language_map: + fi_tdt: fi_tdt +test_files: + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-test.conllu +train_files: + fi_tdt: corpus/ud-treebanks-v2.5/UD_Finnish-TDT/fi_tdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fr_ftb.yaml b/scripts/train/2.5/treebank/fr_ftb.yaml new file mode 100644 index 000000000..49bc7e74c --- /dev/null +++ b/scripts/train/2.5/treebank/fr_ftb.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-dev.conllu +language_codes: +- fr_ftb +language_map: + fr_ftb: fr_ftb +test_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu +train_files: + fr_ftb: corpus/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fr_gsd.yaml b/scripts/train/2.5/treebank/fr_gsd.yaml new file mode 100644 index 000000000..d68694c70 --- /dev/null +++ b/scripts/train/2.5/treebank/fr_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-dev.conllu +language_codes: +- fr_gsd +language_map: + fr_gsd: fr_gsd +test_files: + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-test.conllu +train_files: + fr_gsd: corpus/ud-treebanks-v2.5/UD_French-GSD/fr_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fr_partut.yaml b/scripts/train/2.5/treebank/fr_partut.yaml new file mode 100644 index 000000000..4e5b1572a --- /dev/null +++ b/scripts/train/2.5/treebank/fr_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-dev.conllu +language_codes: +- 
fr_partut +language_map: + fr_partut: fr_partut +test_files: + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-test.conllu +train_files: + fr_partut: corpus/ud-treebanks-v2.5/UD_French-ParTUT/fr_partut-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fr_sequoia.yaml b/scripts/train/2.5/treebank/fr_sequoia.yaml new file mode 100644 index 000000000..37e137ba2 --- /dev/null +++ b/scripts/train/2.5/treebank/fr_sequoia.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu +language_codes: +- fr_sequoia +language_map: + fr_sequoia: fr_sequoia +test_files: + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu +train_files: + fr_sequoia: corpus/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fr_spoken.yaml b/scripts/train/2.5/treebank/fr_spoken.yaml new file mode 100644 index 000000000..ba9abe326 --- /dev/null +++ b/scripts/train/2.5/treebank/fr_spoken.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-dev.conllu +language_codes: +- fr_spoken +language_map: + fr_spoken: fr_spoken +test_files: + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-test.conllu +train_files: + fr_spoken: corpus/ud-treebanks-v2.5/UD_French-Spoken/fr_spoken-ud-train.conllu diff --git a/scripts/train/2.5/treebank/fro_srcmf.yaml b/scripts/train/2.5/treebank/fro_srcmf.yaml new file mode 100644 index 000000000..009fc80e3 --- /dev/null +++ b/scripts/train/2.5/treebank/fro_srcmf.yaml @@ -0,0 +1,10 @@ +dev_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu +language_codes: +- fro_srcmf +language_map: + fro_srcmf: fro_srcmf +test_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu +train_files: + fro_srcmf: corpus/ud-treebanks-v2.5/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ga_idt.yaml b/scripts/train/2.5/treebank/ga_idt.yaml new file mode 100644 index 000000000..3415ed8da --- /dev/null +++ b/scripts/train/2.5/treebank/ga_idt.yaml @@ -0,0 +1,10 @@ +dev_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-dev.conllu +language_codes: +- ga_idt +language_map: + ga_idt: ga_idt +test_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-test.conllu +train_files: + ga_idt: corpus/ud-treebanks-v2.5/UD_Irish-IDT/ga_idt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/gd_arcosg.yaml b/scripts/train/2.5/treebank/gd_arcosg.yaml new file mode 100644 index 000000000..1d1ffe115 --- /dev/null +++ b/scripts/train/2.5/treebank/gd_arcosg.yaml @@ -0,0 +1,10 @@ +dev_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +language_map: + gd_arcosg: gd_arcosg +test_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + gd_arcosg: corpus/ud-treebanks-v2.5/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.5/treebank/gl_ctg.yaml b/scripts/train/2.5/treebank/gl_ctg.yaml new file mode 100644 index 000000000..e766fccf1 --- /dev/null +++ b/scripts/train/2.5/treebank/gl_ctg.yaml @@ -0,0 +1,10 @@ +dev_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-dev.conllu +language_codes: +- gl_ctg +language_map: + gl_ctg: gl_ctg +test_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-test.conllu 
+train_files: + gl_ctg: corpus/ud-treebanks-v2.5/UD_Galician-CTG/gl_ctg-ud-train.conllu diff --git a/scripts/train/2.5/treebank/gl_treegal.yaml b/scripts/train/2.5/treebank/gl_treegal.yaml new file mode 100644 index 000000000..ea2a18f2e --- /dev/null +++ b/scripts/train/2.5/treebank/gl_treegal.yaml @@ -0,0 +1,10 @@ +dev_files: + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +language_codes: +- gl_treegal +language_map: + gl_treegal: gl_treegal +test_files: + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +train_files: + gl_treegal: corpus/ud-treebanks-v2.5/UD_Galician-TreeGal/gl_treegal-ud-train.conllu diff --git a/scripts/train/2.5/treebank/got_proiel.yaml b/scripts/train/2.5/treebank/got_proiel.yaml new file mode 100644 index 000000000..883de1d63 --- /dev/null +++ b/scripts/train/2.5/treebank/got_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu +language_codes: +- got_proiel +language_map: + got_proiel: got_proiel +test_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-test.conllu +train_files: + got_proiel: corpus/ud-treebanks-v2.5/UD_Gothic-PROIEL/got_proiel-ud-train.conllu diff --git a/scripts/train/2.5/treebank/grc_perseus.yaml b/scripts/train/2.5/treebank/grc_perseus.yaml new file mode 100644 index 000000000..cfe1b6568 --- /dev/null +++ b/scripts/train/2.5/treebank/grc_perseus.yaml @@ -0,0 +1,10 @@ +dev_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu +language_codes: +- grc_perseus +language_map: + grc_perseus: grc_perseus +test_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu +train_files: + grc_perseus: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu diff --git a/scripts/train/2.5/treebank/grc_proiel.yaml b/scripts/train/2.5/treebank/grc_proiel.yaml new file mode 100644 index 000000000..f7e09c780 --- /dev/null +++ b/scripts/train/2.5/treebank/grc_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_proiel +language_map: + grc_proiel: grc_proiel +test_files: + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + grc_proiel: corpus/ud-treebanks-v2.5/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git a/scripts/train/2.5/treebank/he_htb.yaml b/scripts/train/2.5/treebank/he_htb.yaml new file mode 100644 index 000000000..c20115c55 --- /dev/null +++ b/scripts/train/2.5/treebank/he_htb.yaml @@ -0,0 +1,10 @@ +dev_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-dev.conllu +language_codes: +- he_htb +language_map: + he_htb: he_htb +test_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-test.conllu +train_files: + he_htb: corpus/ud-treebanks-v2.5/UD_Hebrew-HTB/he_htb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/hi_hdtb.yaml b/scripts/train/2.5/treebank/hi_hdtb.yaml new file mode 100644 index 000000000..07edc3f2d --- /dev/null +++ b/scripts/train/2.5/treebank/hi_hdtb.yaml @@ -0,0 +1,10 @@ +dev_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu +language_codes: +- hi_hdtb +language_map: + hi_hdtb: hi_hdtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu +train_files: + hi_hdtb: 
corpus/ud-treebanks-v2.5/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/hr_set.yaml b/scripts/train/2.5/treebank/hr_set.yaml new file mode 100644 index 000000000..d3a53eacd --- /dev/null +++ b/scripts/train/2.5/treebank/hr_set.yaml @@ -0,0 +1,10 @@ +dev_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-dev.conllu +language_codes: +- hr_set +language_map: + hr_set: hr_set +test_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-test.conllu +train_files: + hr_set: corpus/ud-treebanks-v2.5/UD_Croatian-SET/hr_set-ud-train.conllu diff --git a/scripts/train/2.5/treebank/hsb_ufal.yaml b/scripts/train/2.5/treebank/hsb_ufal.yaml new file mode 100644 index 000000000..9185af495 --- /dev/null +++ b/scripts/train/2.5/treebank/hsb_ufal.yaml @@ -0,0 +1,10 @@ +dev_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +language_codes: +- hsb_ufal +language_map: + hsb_ufal: hsb_ufal +test_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +train_files: + hsb_ufal: corpus/ud-treebanks-v2.5/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu diff --git a/scripts/train/2.5/treebank/hu_szeged.yaml b/scripts/train/2.5/treebank/hu_szeged.yaml new file mode 100644 index 000000000..156888cb2 --- /dev/null +++ b/scripts/train/2.5/treebank/hu_szeged.yaml @@ -0,0 +1,10 @@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu_szeged: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.5/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.5/treebank/hy_armtdp.yaml b/scripts/train/2.5/treebank/hy_armtdp.yaml new file mode 100644 index 000000000..5079109fa --- /dev/null +++ b/scripts/train/2.5/treebank/hy_armtdp.yaml @@ -0,0 +1,10 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: + hy_armtdp: corpus/ud-treebanks-v2.5/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.5/treebank/id_gsd.yaml b/scripts/train/2.5/treebank/id_gsd.yaml new file mode 100644 index 000000000..870eff375 --- /dev/null +++ b/scripts/train/2.5/treebank/id_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_gsd +language_map: + id_gsd: id_gsd +test_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_gsd: corpus/ud-treebanks-v2.5/UD_Indonesian-GSD/id_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/it_isdt.yaml b/scripts/train/2.5/treebank/it_isdt.yaml new file mode 100644 index 000000000..5bf85f201 --- /dev/null +++ b/scripts/train/2.5/treebank/it_isdt.yaml @@ -0,0 +1,10 @@ +dev_files: + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-dev.conllu +language_codes: +- it_isdt +language_map: + it_isdt: it_isdt +test_files: + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-test.conllu +train_files: + it_isdt: corpus/ud-treebanks-v2.5/UD_Italian-ISDT/it_isdt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/it_partut.yaml b/scripts/train/2.5/treebank/it_partut.yaml new file mode 100644 index 
000000000..7df5e3506 --- /dev/null +++ b/scripts/train/2.5/treebank/it_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-dev.conllu +language_codes: +- it_partut +language_map: + it_partut: it_partut +test_files: + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-test.conllu +train_files: + it_partut: corpus/ud-treebanks-v2.5/UD_Italian-ParTUT/it_partut-ud-train.conllu diff --git a/scripts/train/2.5/treebank/it_postwita.yaml b/scripts/train/2.5/treebank/it_postwita.yaml new file mode 100644 index 000000000..c6e4d5ace --- /dev/null +++ b/scripts/train/2.5/treebank/it_postwita.yaml @@ -0,0 +1,10 @@ +dev_files: + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu +language_codes: +- it_postwita +language_map: + it_postwita: it_postwita +test_files: + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu +train_files: + it_postwita: corpus/ud-treebanks-v2.5/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu diff --git a/scripts/train/2.5/treebank/it_twittiro.yaml b/scripts/train/2.5/treebank/it_twittiro.yaml new file mode 100644 index 000000000..1b5a07c89 --- /dev/null +++ b/scripts/train/2.5/treebank/it_twittiro.yaml @@ -0,0 +1,10 @@ +dev_files: + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu +language_codes: +- it_twittiro +language_map: + it_twittiro: it_twittiro +test_files: + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu +train_files: + it_twittiro: corpus/ud-treebanks-v2.5/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu diff --git a/scripts/train/2.5/treebank/it_vit.yaml b/scripts/train/2.5/treebank/it_vit.yaml new file mode 100644 index 000000000..e269b2075 --- /dev/null +++ b/scripts/train/2.5/treebank/it_vit.yaml @@ -0,0 +1,10 @@ +dev_files: + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-dev.conllu +language_codes: +- it_vit +language_map: + it_vit: it_vit +test_files: + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-test.conllu +train_files: + it_vit: corpus/ud-treebanks-v2.5/UD_Italian-VIT/it_vit-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ja_bccwj.yaml b/scripts/train/2.5/treebank/ja_bccwj.yaml new file mode 100644 index 000000000..c3eb3026c --- /dev/null +++ b/scripts/train/2.5/treebank/ja_bccwj.yaml @@ -0,0 +1,10 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu +language_codes: +- ja_bccwj +language_map: + ja_bccwj: ja_bccwj +test_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.5/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ja_gsd.yaml b/scripts/train/2.5/treebank/ja_gsd.yaml new file mode 100644 index 000000000..374290818 --- /dev/null +++ b/scripts/train/2.5/treebank/ja_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_gsd +language_map: + ja_gsd: ja_gsd +test_files: + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_gsd: corpus/ud-treebanks-v2.5/UD_Japanese-GSD/ja_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/kk_ktb.yaml b/scripts/train/2.5/treebank/kk_ktb.yaml new file mode 100644 index 000000000..9c998529d --- /dev/null +++ b/scripts/train/2.5/treebank/kk_ktb.yaml @@ -0,0 +1,10 @@ +dev_files: + kk_ktb: 
corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kk_ktb: kk_ktb +test_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.5/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/kmr_mg.yaml b/scripts/train/2.5/treebank/kmr_mg.yaml new file mode 100644 index 000000000..eedd593a6 --- /dev/null +++ b/scripts/train/2.5/treebank/kmr_mg.yaml @@ -0,0 +1,10 @@ +dev_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- kmr_mg +language_map: + kmr_mg: kmr_mg +test_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + kmr_mg: corpus/ud-treebanks-v2.5/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ko_gsd.yaml b/scripts/train/2.5/treebank/ko_gsd.yaml new file mode 100644 index 000000000..e2abc988d --- /dev/null +++ b/scripts/train/2.5/treebank/ko_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-dev.conllu +language_codes: +- ko_gsd +language_map: + ko_gsd: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.5/UD_Korean-GSD/ko_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ko_kaist.yaml b/scripts/train/2.5/treebank/ko_kaist.yaml new file mode 100644 index 000000000..89c926cc5 --- /dev/null +++ b/scripts/train/2.5/treebank/ko_kaist.yaml @@ -0,0 +1,10 @@ +dev_files: + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_kaist +language_map: + ko_kaist: ko_kaist +test_files: + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_kaist: corpus/ud-treebanks-v2.5/UD_Korean-Kaist/ko_kaist-ud-train.conllu diff --git a/scripts/train/2.5/treebank/la_ittb.yaml b/scripts/train/2.5/treebank/la_ittb.yaml new file mode 100644 index 000000000..daea5fd67 --- /dev/null +++ b/scripts/train/2.5/treebank/la_ittb.yaml @@ -0,0 +1,10 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-dev.conllu +language_codes: +- la_ittb +language_map: + la_ittb: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.5/UD_Latin-ITTB/la_ittb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/la_perseus.yaml b/scripts/train/2.5/treebank/la_perseus.yaml new file mode 100644 index 000000000..98577b1f4 --- /dev/null +++ b/scripts/train/2.5/treebank/la_perseus.yaml @@ -0,0 +1,10 @@ +dev_files: + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu +language_codes: +- la_perseus +language_map: + la_perseus: la_perseus +test_files: + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-test.conllu +train_files: + la_perseus: corpus/ud-treebanks-v2.5/UD_Latin-Perseus/la_perseus-ud-train.conllu diff --git a/scripts/train/2.5/treebank/la_proiel.yaml b/scripts/train/2.5/treebank/la_proiel.yaml new file mode 100644 index 000000000..f52e937de --- /dev/null +++ b/scripts/train/2.5/treebank/la_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_proiel +language_map: + la_proiel: la_proiel +test_files: + la_proiel: corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_proiel: 
corpus/ud-treebanks-v2.5/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.5/treebank/lt_alksnis.yaml b/scripts/train/2.5/treebank/lt_alksnis.yaml new file mode 100644 index 000000000..303beb619 --- /dev/null +++ b/scripts/train/2.5/treebank/lt_alksnis.yaml @@ -0,0 +1,10 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu +language_codes: +- lt_alksnis +language_map: + lt_alksnis: lt_alksnis +test_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.5/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu diff --git a/scripts/train/2.5/treebank/lt_hse.yaml b/scripts/train/2.5/treebank/lt_hse.yaml new file mode 100644 index 000000000..83aa0897d --- /dev/null +++ b/scripts/train/2.5/treebank/lt_hse.yaml @@ -0,0 +1,10 @@ +dev_files: + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu +language_codes: +- lt_hse +language_map: + lt_hse: lt_hse +test_files: + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-test.conllu +train_files: + lt_hse: corpus/ud-treebanks-v2.5/UD_Lithuanian-HSE/lt_hse-ud-train.conllu diff --git a/scripts/train/2.5/treebank/lv_lvtb.yaml b/scripts/train/2.5/treebank/lv_lvtb.yaml new file mode 100644 index 000000000..b8de58296 --- /dev/null +++ b/scripts/train/2.5/treebank/lv_lvtb.yaml @@ -0,0 +1,10 @@ +dev_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lv_lvtb +language_map: + lv_lvtb: lv_lvtb +test_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lv_lvtb: corpus/ud-treebanks-v2.5/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/lzh_kyoto.yaml b/scripts/train/2.5/treebank/lzh_kyoto.yaml new file mode 100644 index 000000000..5d6e04f89 --- /dev/null +++ b/scripts/train/2.5/treebank/lzh_kyoto.yaml @@ -0,0 +1,10 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu +language_codes: +- lzh_kyoto +language_map: + lzh_kyoto: lzh_kyoto +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.5/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu diff --git a/scripts/train/2.5/treebank/mr_ufal.yaml b/scripts/train/2.5/treebank/mr_ufal.yaml new file mode 100644 index 000000000..b54477c10 --- /dev/null +++ b/scripts/train/2.5/treebank/mr_ufal.yaml @@ -0,0 +1,10 @@ +dev_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu +language_codes: +- mr_ufal +language_map: + mr_ufal: mr_ufal +test_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-test.conllu +train_files: + mr_ufal: corpus/ud-treebanks-v2.5/UD_Marathi-UFAL/mr_ufal-ud-train.conllu diff --git a/scripts/train/2.5/treebank/mt_mudt.yaml b/scripts/train/2.5/treebank/mt_mudt.yaml new file mode 100644 index 000000000..6383fbdb8 --- /dev/null +++ b/scripts/train/2.5/treebank/mt_mudt.yaml @@ -0,0 +1,10 @@ +dev_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- mt_mudt +language_map: + mt_mudt: mt_mudt +test_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + mt_mudt: corpus/ud-treebanks-v2.5/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/nl_alpino.yaml b/scripts/train/2.5/treebank/nl_alpino.yaml new 
file mode 100644 index 000000000..66561abac --- /dev/null +++ b/scripts/train/2.5/treebank/nl_alpino.yaml @@ -0,0 +1,10 @@ +dev_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu +language_codes: +- nl_alpino +language_map: + nl_alpino: nl_alpino +test_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-test.conllu +train_files: + nl_alpino: corpus/ud-treebanks-v2.5/UD_Dutch-Alpino/nl_alpino-ud-train.conllu diff --git a/scripts/train/2.5/treebank/nl_lassysmall.yaml b/scripts/train/2.5/treebank/nl_lassysmall.yaml new file mode 100644 index 000000000..d17d32b84 --- /dev/null +++ b/scripts/train/2.5/treebank/nl_lassysmall.yaml @@ -0,0 +1,10 @@ +dev_files: + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu +language_codes: +- nl_lassysmall +language_map: + nl_lassysmall: nl_lassysmall +test_files: + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu +train_files: + nl_lassysmall: corpus/ud-treebanks-v2.5/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu diff --git a/scripts/train/2.5/treebank/no_bokmaal.yaml b/scripts/train/2.5/treebank/no_bokmaal.yaml new file mode 100644 index 000000000..432888902 --- /dev/null +++ b/scripts/train/2.5/treebank/no_bokmaal.yaml @@ -0,0 +1,10 @@ +dev_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu +language_codes: +- no_bokmaal +language_map: + no_bokmaal: no_bokmaal +test_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu +train_files: + no_bokmaal: corpus/ud-treebanks-v2.5/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu diff --git a/scripts/train/2.5/treebank/no_nynorsk.yaml b/scripts/train/2.5/treebank/no_nynorsk.yaml new file mode 100644 index 000000000..09ff7f4e9 --- /dev/null +++ b/scripts/train/2.5/treebank/no_nynorsk.yaml @@ -0,0 +1,10 @@ +dev_files: + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu +language_codes: +- no_nynorsk +language_map: + no_nynorsk: no_nynorsk +test_files: + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu +train_files: + no_nynorsk: corpus/ud-treebanks-v2.5/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu diff --git a/scripts/train/2.5/treebank/no_nynorsklia.yaml b/scripts/train/2.5/treebank/no_nynorsklia.yaml new file mode 100644 index 000000000..63de748cb --- /dev/null +++ b/scripts/train/2.5/treebank/no_nynorsklia.yaml @@ -0,0 +1,10 @@ +dev_files: + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu +language_codes: +- no_nynorsklia +language_map: + no_nynorsklia: no_nynorsklia +test_files: + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu +train_files: + no_nynorsklia: corpus/ud-treebanks-v2.5/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu diff --git a/scripts/train/2.5/treebank/olo_kkpp.yaml b/scripts/train/2.5/treebank/olo_kkpp.yaml new file mode 100644 index 000000000..adfba8194 --- /dev/null +++ b/scripts/train/2.5/treebank/olo_kkpp.yaml @@ -0,0 +1,10 @@ +dev_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- olo_kkpp +language_map: + olo_kkpp: olo_kkpp +test_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + olo_kkpp: corpus/ud-treebanks-v2.5/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.5/treebank/pl_lfg.yaml 
b/scripts/train/2.5/treebank/pl_lfg.yaml new file mode 100644 index 000000000..66959db28 --- /dev/null +++ b/scripts/train/2.5/treebank/pl_lfg.yaml @@ -0,0 +1,10 @@ +dev_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-dev.conllu +language_codes: +- pl_lfg +language_map: + pl_lfg: pl_lfg +test_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-test.conllu +train_files: + pl_lfg: corpus/ud-treebanks-v2.5/UD_Polish-LFG/pl_lfg-ud-train.conllu diff --git a/scripts/train/2.5/treebank/pl_pdb.yaml b/scripts/train/2.5/treebank/pl_pdb.yaml new file mode 100644 index 000000000..6bdc60222 --- /dev/null +++ b/scripts/train/2.5/treebank/pl_pdb.yaml @@ -0,0 +1,10 @@ +dev_files: + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-dev.conllu +language_codes: +- pl_pdb +language_map: + pl_pdb: pl_pdb +test_files: + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-test.conllu +train_files: + pl_pdb: corpus/ud-treebanks-v2.5/UD_Polish-PDB/pl_pdb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/pt_bosque.yaml b/scripts/train/2.5/treebank/pt_bosque.yaml new file mode 100644 index 000000000..3fa372d45 --- /dev/null +++ b/scripts/train/2.5/treebank/pt_bosque.yaml @@ -0,0 +1,10 @@ +dev_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu +language_codes: +- pt_bosque +language_map: + pt_bosque: pt_bosque +test_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu +train_files: + pt_bosque: corpus/ud-treebanks-v2.5/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu diff --git a/scripts/train/2.5/treebank/pt_gsd.yaml b/scripts/train/2.5/treebank/pt_gsd.yaml new file mode 100644 index 000000000..e020b9502 --- /dev/null +++ b/scripts/train/2.5/treebank/pt_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu +language_codes: +- pt_gsd +language_map: + pt_gsd: pt_gsd +test_files: + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-test.conllu +train_files: + pt_gsd: corpus/ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/qhe_hiencs.yaml b/scripts/train/2.5/treebank/qhe_hiencs.yaml new file mode 100644 index 000000000..1c4b25d62 --- /dev/null +++ b/scripts/train/2.5/treebank/qhe_hiencs.yaml @@ -0,0 +1,10 @@ +dev_files: + qhe_hiencs: corpus/ud-treebanks-v2.5/UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu +language_codes: +- qhe_hiencs +language_map: + qhe_hiencs: qhe_hiencs +test_files: + qhe_hiencs: corpus/ud-treebanks-v2.5/UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu +train_files: + qhe_hiencs: corpus/ud-treebanks-v2.5/UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ro_nonstandard.yaml b/scripts/train/2.5/treebank/ro_nonstandard.yaml new file mode 100644 index 000000000..ef7e494d3 --- /dev/null +++ b/scripts/train/2.5/treebank/ro_nonstandard.yaml @@ -0,0 +1,10 @@ +dev_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu +language_codes: +- ro_nonstandard +language_map: + ro_nonstandard: ro_nonstandard +test_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu +train_files: + ro_nonstandard: corpus/ud-treebanks-v2.5/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ro_rrt.yaml b/scripts/train/2.5/treebank/ro_rrt.yaml new file mode 100644 index 000000000..498286b56 --- /dev/null +++ 
b/scripts/train/2.5/treebank/ro_rrt.yaml @@ -0,0 +1,10 @@ +dev_files: + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- ro_rrt +language_map: + ro_rrt: ro_rrt +test_files: + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ro_rrt: corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ru_gsd.yaml b/scripts/train/2.5/treebank/ru_gsd.yaml new file mode 100644 index 000000000..2156739a7 --- /dev/null +++ b/scripts/train/2.5/treebank/ru_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-dev.conllu +language_codes: +- ru_gsd +language_map: + ru_gsd: ru_gsd +test_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-test.conllu +train_files: + ru_gsd: corpus/ud-treebanks-v2.5/UD_Russian-GSD/ru_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ru_syntagrus.yaml b/scripts/train/2.5/treebank/ru_syntagrus.yaml new file mode 100644 index 000000000..023496547 --- /dev/null +++ b/scripts/train/2.5/treebank/ru_syntagrus.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu +language_codes: +- ru_syntagrus +language_map: + ru_syntagrus: ru_syntagrus +test_files: + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu +train_files: + ru_syntagrus: corpus/ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ru_taiga.yaml b/scripts/train/2.5/treebank/ru_taiga.yaml new file mode 100644 index 000000000..dc66be5ae --- /dev/null +++ b/scripts/train/2.5/treebank/ru_taiga.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-dev.conllu +language_codes: +- ru_taiga +language_map: + ru_taiga: ru_taiga +test_files: + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-test.conllu +train_files: + ru_taiga: corpus/ud-treebanks-v2.5/UD_Russian-Taiga/ru_taiga-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sk_snk.yaml b/scripts/train/2.5/treebank/sk_snk.yaml new file mode 100644 index 000000000..d714933f1 --- /dev/null +++ b/scripts/train/2.5/treebank/sk_snk.yaml @@ -0,0 +1,10 @@ +dev_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-dev.conllu +language_codes: +- sk_snk +language_map: + sk_snk: sk_snk +test_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-test.conllu +train_files: + sk_snk: corpus/ud-treebanks-v2.5/UD_Slovak-SNK/sk_snk-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sl_ssj.yaml b/scripts/train/2.5/treebank/sl_ssj.yaml new file mode 100644 index 000000000..57e1ecee2 --- /dev/null +++ b/scripts/train/2.5/treebank/sl_ssj.yaml @@ -0,0 +1,10 @@ +dev_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu +language_codes: +- sl_ssj +language_map: + sl_ssj: sl_ssj +test_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu +train_files: + sl_ssj: corpus/ud-treebanks-v2.5/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sl_sst.yaml b/scripts/train/2.5/treebank/sl_sst.yaml new file mode 100644 index 000000000..a2334e6b5 --- /dev/null +++ b/scripts/train/2.5/treebank/sl_sst.yaml @@ -0,0 +1,10 @@ +dev_files: + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu +language_codes: +- sl_sst +language_map: + sl_sst: sl_sst +test_files: + sl_sst: 
corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-test.conllu +train_files: + sl_sst: corpus/ud-treebanks-v2.5/UD_Slovenian-SST/sl_sst-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sme_giella.yaml b/scripts/train/2.5/treebank/sme_giella.yaml new file mode 100644 index 000000000..f8118b1fe --- /dev/null +++ b/scripts/train/2.5/treebank/sme_giella.yaml @@ -0,0 +1,10 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.5/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sr_set.yaml b/scripts/train/2.5/treebank/sr_set.yaml new file mode 100644 index 000000000..7b2e8d43a --- /dev/null +++ b/scripts/train/2.5/treebank/sr_set.yaml @@ -0,0 +1,10 @@ +dev_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-dev.conllu +language_codes: +- sr_set +language_map: + sr_set: sr_set +test_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-test.conllu +train_files: + sr_set: corpus/ud-treebanks-v2.5/UD_Serbian-SET/sr_set-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sv_lines.yaml b/scripts/train/2.5/treebank/sv_lines.yaml new file mode 100644 index 000000000..723234526 --- /dev/null +++ b/scripts/train/2.5/treebank/sv_lines.yaml @@ -0,0 +1,10 @@ +dev_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-dev.conllu +language_codes: +- sv_lines +language_map: + sv_lines: sv_lines +test_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-test.conllu +train_files: + sv_lines: corpus/ud-treebanks-v2.5/UD_Swedish-LinES/sv_lines-ud-train.conllu diff --git a/scripts/train/2.5/treebank/sv_talbanken.yaml b/scripts/train/2.5/treebank/sv_talbanken.yaml new file mode 100644 index 000000000..9e0d3a419 --- /dev/null +++ b/scripts/train/2.5/treebank/sv_talbanken.yaml @@ -0,0 +1,10 @@ +dev_files: + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- sv_talbanken +language_map: + sv_talbanken: sv_talbanken +test_files: + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + sv_talbanken: corpus/ud-treebanks-v2.5/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu diff --git a/scripts/train/2.5/treebank/swl_sslc.yaml b/scripts/train/2.5/treebank/swl_sslc.yaml new file mode 100644 index 000000000..3a399a9db --- /dev/null +++ b/scripts/train/2.5/treebank/swl_sslc.yaml @@ -0,0 +1,10 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: corpus/ud-treebanks-v2.5/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ta_ttb.yaml b/scripts/train/2.5/treebank/ta_ttb.yaml new file mode 100644 index 000000000..ebad51526 --- /dev/null +++ b/scripts/train/2.5/treebank/ta_ttb.yaml @@ -0,0 +1,10 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta_ttb: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: 
corpus/ud-treebanks-v2.5/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/te_mtg.yaml b/scripts/train/2.5/treebank/te_mtg.yaml new file mode 100644 index 000000000..dcede6a40 --- /dev/null +++ b/scripts/train/2.5/treebank/te_mtg.yaml @@ -0,0 +1,10 @@ +dev_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te_mtg: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.5/treebank/tr_imst.yaml b/scripts/train/2.5/treebank/tr_imst.yaml new file mode 100644 index 000000000..10ce0e91a --- /dev/null +++ b/scripts/train/2.5/treebank/tr_imst.yaml @@ -0,0 +1,10 @@ +dev_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_imst +language_map: + tr_imst: tr_imst +test_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_imst: corpus/ud-treebanks-v2.5/UD_Turkish-IMST/tr_imst-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ug_udt.yaml b/scripts/train/2.5/treebank/ug_udt.yaml new file mode 100644 index 000000000..e5fcd9c5c --- /dev/null +++ b/scripts/train/2.5/treebank/ug_udt.yaml @@ -0,0 +1,10 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug_udt: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + ug_udt: corpus/ud-treebanks-v2.5/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.5/treebank/uk_iu.yaml b/scripts/train/2.5/treebank/uk_iu.yaml new file mode 100644 index 000000000..16807fbe5 --- /dev/null +++ b/scripts/train/2.5/treebank/uk_iu.yaml @@ -0,0 +1,10 @@ +dev_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- uk_iu +language_map: + uk_iu: uk_iu +test_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + uk_iu: corpus/ud-treebanks-v2.5/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.5/treebank/ur_udtb.yaml b/scripts/train/2.5/treebank/ur_udtb.yaml new file mode 100644 index 000000000..3d9ea8e41 --- /dev/null +++ b/scripts/train/2.5/treebank/ur_udtb.yaml @@ -0,0 +1,10 @@ +dev_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- ur_udtb +language_map: + ur_udtb: ur_udtb +test_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-test.conllu +train_files: + ur_udtb: corpus/ud-treebanks-v2.5/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/vi_vtb.yaml b/scripts/train/2.5/treebank/vi_vtb.yaml new file mode 100644 index 000000000..cbf653e54 --- /dev/null +++ b/scripts/train/2.5/treebank/vi_vtb.yaml @@ -0,0 +1,10 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi_vtb: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.5/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/wo_wtb.yaml b/scripts/train/2.5/treebank/wo_wtb.yaml new file mode 100644 index 000000000..452b82352 --- /dev/null +++ b/scripts/train/2.5/treebank/wo_wtb.yaml @@ -0,0 +1,10 @@ +dev_files: + wo_wtb: 
corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo_wtb: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.5/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.5/treebank/zh_gsd.yaml b/scripts/train/2.5/treebank/zh_gsd.yaml new file mode 100644 index 000000000..1e8889afb --- /dev/null +++ b/scripts/train/2.5/treebank/zh_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-dev.conllu +language_codes: +- zh_gsd +language_map: + zh_gsd: zh_gsd +test_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-test.conllu +train_files: + zh_gsd: corpus/ud-treebanks-v2.5/UD_Chinese-GSD/zh_gsd-ud-train.conllu diff --git a/scripts/train/2.5/treebank/zh_gsdsimp.yaml b/scripts/train/2.5/treebank/zh_gsdsimp.yaml new file mode 100644 index 000000000..9782ded9d --- /dev/null +++ b/scripts/train/2.5/treebank/zh_gsdsimp.yaml @@ -0,0 +1,10 @@ +dev_files: + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu +language_codes: +- zh_gsdsimp +language_map: + zh_gsdsimp: zh_gsdsimp +test_files: + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + zh_gsdsimp: corpus/ud-treebanks-v2.5/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu diff --git a/scripts/train/2.7/family/armenian.yaml b/scripts/train/2.7/family/armenian.yaml new file mode 100644 index 000000000..1fa825fdd --- /dev/null +++ b/scripts/train/2.7/family/armenian.yaml @@ -0,0 +1,12 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + armenian: hy_armtdp + hy: hy_armtdp + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.7/family/austro-asiatic.yaml b/scripts/train/2.7/family/austro-asiatic.yaml new file mode 100644 index 000000000..d2f10054f --- /dev/null +++ b/scripts/train/2.7/family/austro-asiatic.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.7/family/austronesian.yaml b/scripts/train/2.7/family/austronesian.yaml new file mode 100644 index 000000000..95c7eacd5 --- /dev/null +++ b/scripts/train/2.7/family/austronesian.yaml @@ -0,0 +1,17 @@ +dev_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_csui +- id_gsd +language_map: + id: id_gsd + id_csui: id_csui + id_gsd: id_gsd + indonesian: id_gsd +test_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-train.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-train.conllu diff --git a/scripts/train/2.7/family/baltic.yaml b/scripts/train/2.7/family/baltic.yaml new 
file mode 100644 index 000000000..322e3dfde --- /dev/null +++ b/scripts/train/2.7/family/baltic.yaml @@ -0,0 +1,24 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lt_hse +- lt_alksnis +- lv_lvtb +language_map: + latvian: lv_lvtb + lithuanian: lt_alksnis + lt: lt_alksnis + lt_alksnis: lt_alksnis + lt_hse: lt_hse + lv: lv_lvtb + lv_lvtb: lv_lvtb +test_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-test.conllu + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-train.conllu + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.7/family/basque.yaml b/scripts/train/2.7/family/basque.yaml new file mode 100644 index 000000000..eeaffa0b6 --- /dev/null +++ b/scripts/train/2.7/family/basque.yaml @@ -0,0 +1,12 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + basque: eu_bdt + eu: eu_bdt + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.7/family/celtic.yaml b/scripts/train/2.7/family/celtic.yaml new file mode 100644 index 000000000..daf92d838 --- /dev/null +++ b/scripts/train/2.7/family/celtic.yaml @@ -0,0 +1,26 @@ +dev_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-dev.conllu + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +- ga_idt +- cy_ccg +language_map: + cy: cy_ccg + cy_ccg: cy_ccg + ga: ga_idt + ga_idt: ga_idt + gd: gd_arcosg + gd_arcosg: gd_arcosg + irish: ga_idt + scottish gaelic: gd_arcosg + welsh: cy_ccg +test_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-test.conllu + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-train.conllu + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-train.conllu + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.7/family/creole.yaml b/scripts/train/2.7/family/creole.yaml new file mode 100644 index 000000000..15aaf4853 --- /dev/null +++ b/scripts/train/2.7/family/creole.yaml @@ -0,0 +1,12 @@ +dev_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-dev.conllu +language_codes: +- pcm_nsc +language_map: + naija: pcm_nsc + pcm: pcm_nsc + pcm_nsc: pcm_nsc +test_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-test.conllu +train_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-train.conllu diff --git a/scripts/train/2.7/family/dravidian.yaml b/scripts/train/2.7/family/dravidian.yaml new file mode 100644 index 000000000..7c85779dd --- /dev/null +++ b/scripts/train/2.7/family/dravidian.yaml @@ 
-0,0 +1,19 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-dev.conllu + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +- ta_ttb +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-test.conllu + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-train.conllu + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.7/family/egyptian.yaml b/scripts/train/2.7/family/egyptian.yaml new file mode 100644 index 000000000..1c15ecbf2 --- /dev/null +++ b/scripts/train/2.7/family/egyptian.yaml @@ -0,0 +1,12 @@ +dev_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop: cop_scriptorium + cop_scriptorium: cop_scriptorium + coptic: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.7/family/finnic.yaml b/scripts/train/2.7/family/finnic.yaml new file mode 100644 index 000000000..3edbac81c --- /dev/null +++ b/scripts/train/2.7/family/finnic.yaml @@ -0,0 +1,36 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-dev.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-dev.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- et_edt +- fi_ftb +- et_ewt +- olo_kkpp +- fi_tdt +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt + fi: fi_tdt + fi_ftb: fi_ftb + fi_tdt: fi_tdt + finnish: fi_tdt + livvi: olo_kkpp + olo: olo_kkpp + olo_kkpp: olo_kkpp +test_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-test.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-train.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-train.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.7/family/germanic.yaml b/scripts/train/2.7/family/germanic.yaml new file mode 100644 index 000000000..94db3de67 --- /dev/null +++ b/scripts/train/2.7/family/germanic.yaml @@ -0,0 +1,126 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-dev.conllu + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-dev.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-dev.conllu + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-dev.conllu + en_ewt: 
corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-dev.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-dev.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-dev.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-dev.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-dev.conllu + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-dev.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- fo_oft +- sv_talbanken +- en_lines +- en_ewt +- is_icepahc +- de_hdt +- got_proiel +- en_partut +- no_bokmaal +- nl_alpino +- sv_lines +- fo_farpahc +- de_gsd +- nl_lassysmall +- no_nynorsk +- no_nynorsklia +- af_afribooms +- da_ddt +- en_gum +- en_gumreddit +- en_esl +language_map: + af: af_afribooms + af_afribooms: af_afribooms + afrikaans: af_afribooms + da: da_ddt + da_ddt: da_ddt + danish: da_ddt + de: de_hdt + de_gsd: de_gsd + de_hdt: de_hdt + dutch: nl_alpino + en: en_ewt + en_esl: en_esl + en_ewt: en_ewt + en_gum: en_gum + en_gumreddit: en_gumreddit + en_lines: en_lines + en_partut: en_partut + english: en_ewt + faroese: fo_oft + fo: fo_oft + fo_farpahc: fo_farpahc + german: de_hdt + got: got_proiel + got_proiel: got_proiel + gothic: got_proiel + icelandic: is_icepahc + is: is_icepahc + is_icepahc: is_icepahc + nl: nl_alpino + nl_alpino: nl_alpino + nl_lassysmall: nl_lassysmall + 'no': no_bokmaal + no_bokmaal: no_bokmaal + no_nynorsk: no_nynorsk + no_nynorsklia: no_nynorsklia + norwegian: no_bokmaal + sv: sv_talbanken + sv_lines: sv_lines + sv_talbanken: sv_talbanken + swedish: sv_talbanken +test_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-test.conllu + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-test.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-test.conllu + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-test.conllu + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-test.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-test.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-test.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-test.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-test.conllu + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-test.conllu + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-test.conllu + nl_lassysmall: 
corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-test.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-train.conllu + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-train.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-train.conllu + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-train.conllu + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-train.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-train.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-train.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-train.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-train.conllu + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-train.conllu + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-train.conllu + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-train.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu diff --git a/scripts/train/2.7/family/greek.yaml b/scripts/train/2.7/family/greek.yaml new file mode 100644 index 000000000..53e902d4d --- /dev/null +++ b/scripts/train/2.7/family/greek.yaml @@ -0,0 +1,24 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-dev.conllu + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_perseus +- grc_proiel +- el_gdt +language_map: + ancient greek: grc_proiel + el: el_gdt + el_gdt: el_gdt + grc: grc_proiel + grc_perseus: grc_perseus + grc_proiel: grc_proiel + greek: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-test.conllu + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-train.conllu + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git a/scripts/train/2.7/family/indic.yaml b/scripts/train/2.7/family/indic.yaml new file mode 100644 index 000000000..8488b2776 --- /dev/null +++ 
b/scripts/train/2.7/family/indic.yaml @@ -0,0 +1,33 @@ +dev_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- sa_vedic +- hi_hdtb +- ur_udtb +- mr_ufal +language_map: + hi: hi_hdtb + hi_hdtb: hi_hdtb + hindi: hi_hdtb + marathi: mr_ufal + mr: mr_ufal + mr_ufal: mr_ufal + sa: sa_vedic + sa_vedic: sa_vedic + sanskrit: sa_vedic + ur: ur_udtb + ur_udtb: ur_udtb + urdu: ur_udtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-test.conllu + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-test.conllu +train_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-train.conllu + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.7/family/iranian.yaml b/scripts/train/2.7/family/iranian.yaml new file mode 100644 index 000000000..4e695576c --- /dev/null +++ b/scripts/train/2.7/family/iranian.yaml @@ -0,0 +1,24 @@ +dev_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-dev.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-dev.conllu + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- fa_seraji +- kmr_mg +- fa_perdt +language_map: + fa: fa_seraji + fa_perdt: fa_perdt + fa_seraji: fa_seraji + kmr: kmr_mg + kmr_mg: kmr_mg + kurmanji: kmr_mg + persian: fa_seraji +test_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-test.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-test.conllu + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-train.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-train.conllu + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.7/family/japanese.yaml b/scripts/train/2.7/family/japanese.yaml new file mode 100644 index 000000000..7ef98ba08 --- /dev/null +++ b/scripts/train/2.7/family/japanese.yaml @@ -0,0 +1,17 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_bccwj +- ja_gsd +language_map: + ja: ja_gsd + ja_bccwj: ja_bccwj + ja_gsd: ja_gsd + japanese: ja_gsd +test_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-train.conllu diff --git a/scripts/train/2.7/family/korean.yaml b/scripts/train/2.7/family/korean.yaml new file mode 100644 index 000000000..1674811b5 --- /dev/null +++ b/scripts/train/2.7/family/korean.yaml @@ -0,0 +1,17 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-dev.conllu + 
ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_gsd +- ko_kaist +language_map: + ko: ko_gsd + ko_gsd: ko_gsd + ko_kaist: ko_kaist + korean: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-test.conllu + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-train.conllu + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-train.conllu diff --git a/scripts/train/2.7/family/latin.yaml b/scripts/train/2.7/family/latin.yaml new file mode 100644 index 000000000..9a0afe5ea --- /dev/null +++ b/scripts/train/2.7/family/latin.yaml @@ -0,0 +1,27 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-dev.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-dev.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_perseus +- la_proiel +- la_ittb +- la_llct +language_map: + la: la_ittb + la_ittb: la_ittb + la_llct: la_llct + la_perseus: la_perseus + la_proiel: la_proiel + latin: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-test.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-test.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-train.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-train.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-train.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.7/family/malayo-sumbawan.yaml b/scripts/train/2.7/family/malayo-sumbawan.yaml new file mode 100644 index 000000000..95c7eacd5 --- /dev/null +++ b/scripts/train/2.7/family/malayo-sumbawan.yaml @@ -0,0 +1,17 @@ +dev_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_csui +- id_gsd +language_map: + id: id_gsd + id_csui: id_csui + id_gsd: id_gsd + indonesian: id_gsd +test_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-train.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-train.conllu diff --git a/scripts/train/2.7/family/mongolic.yaml b/scripts/train/2.7/family/mongolic.yaml new file mode 100644 index 000000000..70260ceb9 --- /dev/null +++ b/scripts/train/2.7/family/mongolic.yaml @@ -0,0 +1,12 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + buryat: bxr_bdt + bxr: bxr_bdt + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.7/family/niger-congo.yaml b/scripts/train/2.7/family/niger-congo.yaml new file mode 100644 index 000000000..22f0e9d47 --- /dev/null +++ b/scripts/train/2.7/family/niger-congo.yaml @@ -0,0 +1,12 @@ 
+dev_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.7/family/northern atlantic.yaml b/scripts/train/2.7/family/northern atlantic.yaml new file mode 100644 index 000000000..22f0e9d47 --- /dev/null +++ b/scripts/train/2.7/family/northern atlantic.yaml @@ -0,0 +1,12 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.7/family/northwestern.yaml b/scripts/train/2.7/family/northwestern.yaml new file mode 100644 index 000000000..39a14ce30 --- /dev/null +++ b/scripts/train/2.7/family/northwestern.yaml @@ -0,0 +1,12 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb +test_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.7/family/romance.yaml b/scripts/train/2.7/family/romance.yaml new file mode 100644 index 000000000..313fe2a8e --- /dev/null +++ b/scripts/train/2.7/family/romance.yaml @@ -0,0 +1,121 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-dev.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-dev.conllu + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-dev.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-dev.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-dev.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-dev.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-dev.conllu + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-dev.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-dev.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-dev.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-dev.conllu + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- fr_partut +- pt_gsd +- fr_ftb +- gl_ctg +- it_twittiro +- pt_bosque +- fr_gsd +- it_isdt +- ro_rrt +- es_gsd +- it_partut +- ca_ancora +- gl_treegal +- it_postwita +- ro_nonstandard +- fr_sequoia +- fr_spoken +- it_vit +- es_ancora +- fro_srcmf 
+language_map: + ca: ca_ancora + ca_ancora: ca_ancora + catalan: ca_ancora + es: es_gsd + es_ancora: es_ancora + es_gsd: es_gsd + fr: fr_gsd + fr_ftb: fr_ftb + fr_gsd: fr_gsd + fr_partut: fr_partut + fr_sequoia: fr_sequoia + fr_spoken: fr_spoken + french: fr_gsd + fro: fro_srcmf + fro_srcmf: fro_srcmf + galician: gl_ctg + gl: gl_ctg + gl_ctg: gl_ctg + gl_treegal: gl_treegal + it: it_isdt + it_isdt: it_isdt + it_partut: it_partut + it_postwita: it_postwita + it_twittiro: it_twittiro + it_vit: it_vit + italian: it_isdt + old french: fro_srcmf + portuguese: pt_bosque + pt: pt_bosque + pt_bosque: pt_bosque + pt_gsd: pt_gsd + ro: ro_rrt + ro_nonstandard: ro_nonstandard + ro_rrt: ro_rrt + romanian: ro_rrt + spanish: es_gsd +test_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-test.conllu + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-test.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-test.conllu + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-test.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-test.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-test.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-test.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-test.conllu + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-test.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-test.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-test.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-test.conllu + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-test.conllu + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-train.conllu + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-train.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-train.conllu + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-train.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-train.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-train.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-train.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-train.conllu + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-train.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-train.conllu + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-train.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-train.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu + 
it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-train.conllu + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-train.conllu + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.7/family/sami.yaml b/scripts/train/2.7/family/sami.yaml new file mode 100644 index 000000000..21e428a4b --- /dev/null +++ b/scripts/train/2.7/family/sami.yaml @@ -0,0 +1,12 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + north sami: sme_giella + sme: sme_giella + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.7/family/semitic.yaml b/scripts/train/2.7/family/semitic.yaml new file mode 100644 index 000000000..ab53f6378 --- /dev/null +++ b/scripts/train/2.7/family/semitic.yaml @@ -0,0 +1,31 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-dev.conllu + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-dev.conllu + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- ar_nyuad +- ar_padt +- he_htb +- mt_mudt +language_map: + ar: ar_padt + ar_nyuad: ar_nyuad + ar_padt: ar_padt + arabic: ar_padt + he: he_htb + he_htb: he_htb + hebrew: he_htb + maltese: mt_mudt + mt: mt_mudt + mt_mudt: mt_mudt +test_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-test.conllu + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-test.conllu + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-train.conllu + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-train.conllu + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.7/family/sign language.yaml b/scripts/train/2.7/family/sign language.yaml new file mode 100644 index 000000000..2e406e20f --- /dev/null +++ b/scripts/train/2.7/family/sign language.yaml @@ -0,0 +1,12 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swedish sign language: swl_sslc + swl: swl_sslc + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.7/family/sino-tibetan.yaml b/scripts/train/2.7/family/sino-tibetan.yaml new file mode 100644 index 000000000..85027558b --- /dev/null +++ b/scripts/train/2.7/family/sino-tibetan.yaml @@ -0,0 +1,24 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-dev.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu 
+language_codes: +- zh_gsdsimp +- lzh_kyoto +- zh_gsd +language_map: + chinese: zh_gsdsimp + classical chinese: lzh_kyoto + lzh: lzh_kyoto + lzh_kyoto: lzh_kyoto + zh: zh_gsdsimp + zh_gsd: zh_gsd + zh_gsdsimp: zh_gsdsimp +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-test.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-train.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu diff --git a/scripts/train/2.7/family/slavic.yaml b/scripts/train/2.7/family/slavic.yaml new file mode 100644 index 000000000..63bc14746 --- /dev/null +++ b/scripts/train/2.7/family/slavic.yaml @@ -0,0 +1,124 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-dev.conllu + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-dev.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-dev.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-dev.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-dev.conllu + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-dev.conllu + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-dev.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-dev.conllu + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-dev.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-dev.conllu + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-dev.conllu + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-dev.conllu + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- hr_set +- pl_lfg +- sl_sst +- cu_proiel +- be_hse +- uk_iu +- ru_gsd +- ru_taiga +- cs_pdt +- bg_btb +- sr_set +- ru_syntagrus +- cs_fictree +- hsb_ufal +- cs_cac +- sl_ssj +- cs_cltt +- pl_pdb +- sk_snk +language_map: + be: be_hse + be_hse: be_hse + belarusian: be_hse + bg: bg_btb + bg_btb: bg_btb + bulgarian: bg_btb + croatian: hr_set + cs: cs_pdt + cs_cac: cs_cac + cs_cltt: cs_cltt + cs_fictree: cs_fictree + cs_pdt: cs_pdt + cu: cu_proiel + cu_proiel: cu_proiel + czech: cs_pdt + hr: hr_set + hr_set: hr_set + hsb: hsb_ufal + hsb_ufal: hsb_ufal + old church slavonic: cu_proiel + pl: pl_pdb + pl_lfg: pl_lfg + pl_pdb: pl_pdb + polish: pl_pdb + ru: ru_syntagrus + ru_gsd: ru_gsd + ru_syntagrus: ru_syntagrus + ru_taiga: ru_taiga + russian: ru_syntagrus + serbian: sr_set + sk: sk_snk + sk_snk: sk_snk + sl: sl_ssj + sl_ssj: sl_ssj + sl_sst: sl_sst + slovak: sk_snk + slovenian: sl_ssj + sr: sr_set + sr_set: sr_set + uk: uk_iu + uk_iu: uk_iu + ukrainian: uk_iu + upper sorbian: hsb_ufal +test_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-test.conllu + bg_btb: 
corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-test.conllu + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-test.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-test.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-test.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-test.conllu + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-test.conllu + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-test.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-test.conllu + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-test.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-test.conllu + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-test.conllu + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-test.conllu + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-train.conllu + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-train.conllu + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-train.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-train.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-train.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-train.conllu + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-train.conllu + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-train.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-train.conllu + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-train.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-train.conllu + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-train.conllu + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-train.conllu + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-train.conllu + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.7/family/south central.yaml b/scripts/train/2.7/family/south central.yaml new file mode 100644 index 000000000..184544688 --- /dev/null +++ b/scripts/train/2.7/family/south central.yaml @@ -0,0 +1,12 @@ +dev_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.7/family/southeastern.yaml b/scripts/train/2.7/family/southeastern.yaml new file mode 100644 index 000000000..070d6e58e --- 
/dev/null +++ b/scripts/train/2.7/family/southeastern.yaml @@ -0,0 +1,12 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.7/family/southern.yaml b/scripts/train/2.7/family/southern.yaml new file mode 100644 index 000000000..3f04b99e0 --- /dev/null +++ b/scripts/train/2.7/family/southern.yaml @@ -0,0 +1,12 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.7/family/southwestern.yaml b/scripts/train/2.7/family/southwestern.yaml new file mode 100644 index 000000000..02b0d40ab --- /dev/null +++ b/scripts/train/2.7/family/southwestern.yaml @@ -0,0 +1,17 @@ +dev_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-dev.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_imst +- tr_boun +language_map: + tr: tr_imst + tr_boun: tr_boun + tr_imst: tr_imst + turkish: tr_imst +test_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-test.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-train.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-train.conllu diff --git a/scripts/train/2.7/family/turkic.yaml b/scripts/train/2.7/family/turkic.yaml new file mode 100644 index 000000000..8dc56b010 --- /dev/null +++ b/scripts/train/2.7/family/turkic.yaml @@ -0,0 +1,31 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-dev.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-dev.conllu + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- kk_ktb +- tr_imst +- tr_boun +- ug_udt +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb + tr: tr_imst + tr_boun: tr_boun + tr_imst: tr_imst + turkish: tr_imst + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-test.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-test.conllu + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-train.conllu + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-train.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-train.conllu + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.7/family/ugric.yaml b/scripts/train/2.7/family/ugric.yaml new file mode 100644 index 000000000..9dcf11671 --- /dev/null +++ b/scripts/train/2.7/family/ugric.yaml @@ -0,0 +1,12 @@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu: hu_szeged + 
hu_szeged: hu_szeged + hungarian: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.7/family/uralic.yaml b/scripts/train/2.7/family/uralic.yaml new file mode 100644 index 000000000..11b7b212e --- /dev/null +++ b/scripts/train/2.7/family/uralic.yaml @@ -0,0 +1,50 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-dev.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-dev.conllu + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- et_edt +- fi_ftb +- sme_giella +- et_ewt +- hu_szeged +- olo_kkpp +- fi_tdt +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt + fi: fi_tdt + fi_ftb: fi_ftb + fi_tdt: fi_tdt + finnish: fi_tdt + hu: hu_szeged + hu_szeged: hu_szeged + hungarian: hu_szeged + livvi: olo_kkpp + north sami: sme_giella + olo: olo_kkpp + olo_kkpp: olo_kkpp + sme: sme_giella + sme_giella: sme_giella +test_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-test.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-test.conllu + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-train.conllu + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-train.conllu + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.7/family/viet-muong.yaml b/scripts/train/2.7/family/viet-muong.yaml new file mode 100644 index 000000000..d2f10054f --- /dev/null +++ b/scripts/train/2.7/family/viet-muong.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.7/language/af.yaml b/scripts/train/2.7/language/af.yaml new file mode 100644 index 000000000..8ec9405bf --- /dev/null +++ b/scripts/train/2.7/language/af.yaml @@ -0,0 +1,12 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu +language_codes: +- af_afribooms +language_map: + af: af_afribooms + af_afribooms: af_afribooms + afrikaans: af_afribooms +test_files: + 
af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu diff --git a/scripts/train/2.7/language/ar.yaml b/scripts/train/2.7/language/ar.yaml new file mode 100644 index 000000000..bd4bbd310 --- /dev/null +++ b/scripts/train/2.7/language/ar.yaml @@ -0,0 +1,17 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-dev.conllu +language_codes: +- ar_nyuad +- ar_padt +language_map: + ar: ar_padt + ar_nyuad: ar_nyuad + ar_padt: ar_padt + arabic: ar_padt +test_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-train.conllu diff --git a/scripts/train/2.7/language/be.yaml b/scripts/train/2.7/language/be.yaml new file mode 100644 index 000000000..164bb882c --- /dev/null +++ b/scripts/train/2.7/language/be.yaml @@ -0,0 +1,12 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-dev.conllu +language_codes: +- be_hse +language_map: + be: be_hse + be_hse: be_hse + belarusian: be_hse +test_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-train.conllu diff --git a/scripts/train/2.7/language/bg.yaml b/scripts/train/2.7/language/bg.yaml new file mode 100644 index 000000000..1b5979f0a --- /dev/null +++ b/scripts/train/2.7/language/bg.yaml @@ -0,0 +1,12 @@ +dev_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu +language_codes: +- bg_btb +language_map: + bg: bg_btb + bg_btb: bg_btb + bulgarian: bg_btb +test_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-test.conllu +train_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-train.conllu diff --git a/scripts/train/2.7/language/bxr.yaml b/scripts/train/2.7/language/bxr.yaml new file mode 100644 index 000000000..70260ceb9 --- /dev/null +++ b/scripts/train/2.7/language/bxr.yaml @@ -0,0 +1,12 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + buryat: bxr_bdt + bxr: bxr_bdt + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.7/language/ca.yaml b/scripts/train/2.7/language/ca.yaml new file mode 100644 index 000000000..6f8e6e124 --- /dev/null +++ b/scripts/train/2.7/language/ca.yaml @@ -0,0 +1,12 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu +language_codes: +- ca_ancora +language_map: + ca: ca_ancora + ca_ancora: ca_ancora + catalan: ca_ancora +test_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-test.conllu +train_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-train.conllu diff --git a/scripts/train/2.7/language/cop.yaml b/scripts/train/2.7/language/cop.yaml new file mode 100644 index 000000000..1c15ecbf2 --- /dev/null +++ b/scripts/train/2.7/language/cop.yaml @@ -0,0 +1,12 @@ +dev_files: + cop_scriptorium: 
corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop: cop_scriptorium + cop_scriptorium: cop_scriptorium + coptic: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.7/language/cs.yaml b/scripts/train/2.7/language/cs.yaml new file mode 100644 index 000000000..abbbc312a --- /dev/null +++ b/scripts/train/2.7/language/cs.yaml @@ -0,0 +1,27 @@ +dev_files: + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-dev.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-dev.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-dev.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-dev.conllu +language_codes: +- cs_fictree +- cs_cac +- cs_pdt +- cs_cltt +language_map: + cs: cs_pdt + cs_cac: cs_cac + cs_cltt: cs_cltt + cs_fictree: cs_fictree + cs_pdt: cs_pdt + czech: cs_pdt +test_files: + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-test.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-test.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-test.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-test.conllu +train_files: + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-train.conllu + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-train.conllu + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-train.conllu + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-train.conllu diff --git a/scripts/train/2.7/language/cu.yaml b/scripts/train/2.7/language/cu.yaml new file mode 100644 index 000000000..eef05c14c --- /dev/null +++ b/scripts/train/2.7/language/cu.yaml @@ -0,0 +1,12 @@ +dev_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu +language_codes: +- cu_proiel +language_map: + cu: cu_proiel + cu_proiel: cu_proiel + old church slavonic: cu_proiel +test_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu +train_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu diff --git a/scripts/train/2.7/language/cy.yaml b/scripts/train/2.7/language/cy.yaml new file mode 100644 index 000000000..ba3e31e6d --- /dev/null +++ b/scripts/train/2.7/language/cy.yaml @@ -0,0 +1,12 @@ +dev_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu +language_codes: +- cy_ccg +language_map: + cy: cy_ccg + cy_ccg: cy_ccg + welsh: cy_ccg +test_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu +train_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-train.conllu diff --git a/scripts/train/2.7/language/da.yaml b/scripts/train/2.7/language/da.yaml new file mode 100644 index 000000000..df3045898 --- /dev/null +++ b/scripts/train/2.7/language/da.yaml @@ -0,0 +1,12 @@ +dev_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-dev.conllu +language_codes: +- da_ddt +language_map: + da: da_ddt + da_ddt: da_ddt + danish: da_ddt +test_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-test.conllu +train_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-train.conllu diff --git a/scripts/train/2.7/language/de.yaml 
b/scripts/train/2.7/language/de.yaml new file mode 100644 index 000000000..d1490f089 --- /dev/null +++ b/scripts/train/2.7/language/de.yaml @@ -0,0 +1,17 @@ +dev_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-dev.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-dev.conllu +language_codes: +- de_gsd +- de_hdt +language_map: + de: de_hdt + de_gsd: de_gsd + de_hdt: de_hdt + german: de_hdt +test_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-test.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-test.conllu +train_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-train.conllu + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-train.conllu diff --git a/scripts/train/2.7/language/el.yaml b/scripts/train/2.7/language/el.yaml new file mode 100644 index 000000000..fffdd95f6 --- /dev/null +++ b/scripts/train/2.7/language/el.yaml @@ -0,0 +1,12 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-dev.conllu +language_codes: +- el_gdt +language_map: + el: el_gdt + el_gdt: el_gdt + greek: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-train.conllu diff --git a/scripts/train/2.7/language/en.yaml b/scripts/train/2.7/language/en.yaml new file mode 100644 index 000000000..08c1e40bc --- /dev/null +++ b/scripts/train/2.7/language/en.yaml @@ -0,0 +1,37 @@ +dev_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-dev.conllu + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-dev.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-dev.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-dev.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-dev.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-dev.conllu +language_codes: +- en_partut +- en_lines +- en_ewt +- en_gum +- en_gumreddit +- en_esl +language_map: + en: en_ewt + en_esl: en_esl + en_ewt: en_ewt + en_gum: en_gum + en_gumreddit: en_gumreddit + en_lines: en_lines + en_partut: en_partut + english: en_ewt +test_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-test.conllu + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-test.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-test.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-test.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-test.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-test.conllu +train_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-train.conllu + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-train.conllu + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-train.conllu + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-train.conllu + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-train.conllu + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-train.conllu diff --git a/scripts/train/2.7/language/es.yaml b/scripts/train/2.7/language/es.yaml new file mode 100644 index 000000000..e9c833c7f --- /dev/null +++ b/scripts/train/2.7/language/es.yaml @@ -0,0 +1,17 @@ +dev_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-dev.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-dev.conllu 
+language_codes: +- es_ancora +- es_gsd +language_map: + es: es_gsd + es_ancora: es_ancora + es_gsd: es_gsd + spanish: es_gsd +test_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-test.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-test.conllu +train_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-train.conllu + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-train.conllu diff --git a/scripts/train/2.7/language/et.yaml b/scripts/train/2.7/language/et.yaml new file mode 100644 index 000000000..d55d7d92e --- /dev/null +++ b/scripts/train/2.7/language/et.yaml @@ -0,0 +1,17 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-dev.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-dev.conllu +language_codes: +- et_edt +- et_ewt +language_map: + estonian: et_edt + et: et_edt + et_edt: et_edt + et_ewt: et_ewt +test_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-test.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-train.conllu + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-train.conllu diff --git a/scripts/train/2.7/language/eu.yaml b/scripts/train/2.7/language/eu.yaml new file mode 100644 index 000000000..eeaffa0b6 --- /dev/null +++ b/scripts/train/2.7/language/eu.yaml @@ -0,0 +1,12 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + basque: eu_bdt + eu: eu_bdt + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.7/language/fa.yaml b/scripts/train/2.7/language/fa.yaml new file mode 100644 index 000000000..32d247856 --- /dev/null +++ b/scripts/train/2.7/language/fa.yaml @@ -0,0 +1,17 @@ +dev_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-dev.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-dev.conllu +language_codes: +- fa_seraji +- fa_perdt +language_map: + fa: fa_seraji + fa_perdt: fa_perdt + fa_seraji: fa_seraji + persian: fa_seraji +test_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-test.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-test.conllu +train_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-train.conllu + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-train.conllu diff --git a/scripts/train/2.7/language/fi.yaml b/scripts/train/2.7/language/fi.yaml new file mode 100644 index 000000000..69f0ad6e1 --- /dev/null +++ b/scripts/train/2.7/language/fi.yaml @@ -0,0 +1,17 @@ +dev_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-dev.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-dev.conllu +language_codes: +- fi_ftb +- fi_tdt +language_map: + fi: fi_tdt + fi_ftb: fi_ftb + fi_tdt: fi_tdt + finnish: fi_tdt +test_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-test.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-test.conllu +train_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-train.conllu + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-train.conllu diff --git a/scripts/train/2.7/language/fo.yaml b/scripts/train/2.7/language/fo.yaml new file 
mode 100644 index 000000000..d22755ca5 --- /dev/null +++ b/scripts/train/2.7/language/fo.yaml @@ -0,0 +1,12 @@ +dev_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu +language_codes: +- fo_farpahc +language_map: + faroese: fo_farpahc + fo: fo_farpahc + fo_farpahc: fo_farpahc +test_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu +train_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu diff --git a/scripts/train/2.7/language/fr.yaml b/scripts/train/2.7/language/fr.yaml new file mode 100644 index 000000000..c26fc3337 --- /dev/null +++ b/scripts/train/2.7/language/fr.yaml @@ -0,0 +1,32 @@ +dev_files: + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-dev.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-dev.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-dev.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-dev.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-dev.conllu +language_codes: +- fr_partut +- fr_gsd +- fr_sequoia +- fr_spoken +- fr_ftb +language_map: + fr: fr_gsd + fr_ftb: fr_ftb + fr_gsd: fr_gsd + fr_partut: fr_partut + fr_sequoia: fr_sequoia + fr_spoken: fr_spoken + french: fr_gsd +test_files: + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-test.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-test.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-test.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-test.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-test.conllu +train_files: + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-train.conllu + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-train.conllu + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-train.conllu + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-train.conllu + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-train.conllu diff --git a/scripts/train/2.7/language/fro.yaml b/scripts/train/2.7/language/fro.yaml new file mode 100644 index 000000000..63cc54f67 --- /dev/null +++ b/scripts/train/2.7/language/fro.yaml @@ -0,0 +1,12 @@ +dev_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu +language_codes: +- fro_srcmf +language_map: + fro: fro_srcmf + fro_srcmf: fro_srcmf + old french: fro_srcmf +test_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu +train_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu diff --git a/scripts/train/2.7/language/ga.yaml b/scripts/train/2.7/language/ga.yaml new file mode 100644 index 000000000..3af88676a --- /dev/null +++ b/scripts/train/2.7/language/ga.yaml @@ -0,0 +1,12 @@ +dev_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-dev.conllu +language_codes: +- ga_idt +language_map: + ga: ga_idt + ga_idt: ga_idt + irish: ga_idt +test_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-test.conllu +train_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-train.conllu diff --git a/scripts/train/2.7/language/gd.yaml b/scripts/train/2.7/language/gd.yaml new file mode 100644 index 000000000..35b7423f0 --- /dev/null +++ b/scripts/train/2.7/language/gd.yaml @@ -0,0 +1,12 @@ +dev_files: + gd_arcosg: 
corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +language_map: + gd: gd_arcosg + gd_arcosg: gd_arcosg + scottish gaelic: gd_arcosg +test_files: + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.7/language/gl.yaml b/scripts/train/2.7/language/gl.yaml new file mode 100644 index 000000000..3b0c45ee0 --- /dev/null +++ b/scripts/train/2.7/language/gl.yaml @@ -0,0 +1,17 @@ +dev_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-dev.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +language_codes: +- gl_ctg +- gl_treegal +language_map: + galician: gl_ctg + gl: gl_ctg + gl_ctg: gl_ctg + gl_treegal: gl_treegal +test_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-test.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +train_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-train.conllu + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-train.conllu diff --git a/scripts/train/2.7/language/got.yaml b/scripts/train/2.7/language/got.yaml new file mode 100644 index 000000000..43eae14f6 --- /dev/null +++ b/scripts/train/2.7/language/got.yaml @@ -0,0 +1,12 @@ +dev_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu +language_codes: +- got_proiel +language_map: + got: got_proiel + got_proiel: got_proiel + gothic: got_proiel +test_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-test.conllu +train_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-train.conllu diff --git a/scripts/train/2.7/language/grc.yaml b/scripts/train/2.7/language/grc.yaml new file mode 100644 index 000000000..e054d5823 --- /dev/null +++ b/scripts/train/2.7/language/grc.yaml @@ -0,0 +1,17 @@ +dev_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_perseus +- grc_proiel +language_map: + ancient greek: grc_proiel + grc: grc_proiel + grc_perseus: grc_perseus + grc_proiel: grc_proiel +test_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git a/scripts/train/2.7/language/he.yaml b/scripts/train/2.7/language/he.yaml new file mode 100644 index 000000000..505483a6f --- /dev/null +++ b/scripts/train/2.7/language/he.yaml @@ -0,0 +1,12 @@ +dev_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-dev.conllu +language_codes: +- he_htb +language_map: + he: he_htb + he_htb: he_htb + hebrew: he_htb +test_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-test.conllu +train_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-train.conllu diff --git a/scripts/train/2.7/language/hi.yaml b/scripts/train/2.7/language/hi.yaml new file mode 100644 index 000000000..ceb9aad90 --- /dev/null +++ b/scripts/train/2.7/language/hi.yaml @@ -0,0 +1,12 @@ 
+dev_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu +language_codes: +- hi_hdtb +language_map: + hi: hi_hdtb + hi_hdtb: hi_hdtb + hindi: hi_hdtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu +train_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu diff --git a/scripts/train/2.7/language/hr.yaml b/scripts/train/2.7/language/hr.yaml new file mode 100644 index 000000000..64045b63c --- /dev/null +++ b/scripts/train/2.7/language/hr.yaml @@ -0,0 +1,12 @@ +dev_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-dev.conllu +language_codes: +- hr_set +language_map: + croatian: hr_set + hr: hr_set + hr_set: hr_set +test_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-test.conllu +train_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-train.conllu diff --git a/scripts/train/2.7/language/hsb.yaml b/scripts/train/2.7/language/hsb.yaml new file mode 100644 index 000000000..69bdd068e --- /dev/null +++ b/scripts/train/2.7/language/hsb.yaml @@ -0,0 +1,12 @@ +dev_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +language_codes: +- hsb_ufal +language_map: + hsb: hsb_ufal + hsb_ufal: hsb_ufal + upper sorbian: hsb_ufal +test_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +train_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu diff --git a/scripts/train/2.7/language/hu.yaml b/scripts/train/2.7/language/hu.yaml new file mode 100644 index 000000000..9dcf11671 --- /dev/null +++ b/scripts/train/2.7/language/hu.yaml @@ -0,0 +1,12 @@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu: hu_szeged + hu_szeged: hu_szeged + hungarian: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.7/language/hy.yaml b/scripts/train/2.7/language/hy.yaml new file mode 100644 index 000000000..1fa825fdd --- /dev/null +++ b/scripts/train/2.7/language/hy.yaml @@ -0,0 +1,12 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + armenian: hy_armtdp + hy: hy_armtdp + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.7/language/id.yaml b/scripts/train/2.7/language/id.yaml new file mode 100644 index 000000000..95c7eacd5 --- /dev/null +++ b/scripts/train/2.7/language/id.yaml @@ -0,0 +1,17 @@ +dev_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_csui +- id_gsd +language_map: + id: id_gsd + id_csui: id_csui + id_gsd: id_gsd + indonesian: id_gsd +test_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-train.conllu + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-train.conllu diff --git 
a/scripts/train/2.7/language/is.yaml b/scripts/train/2.7/language/is.yaml new file mode 100644 index 000000000..0a9d23814 --- /dev/null +++ b/scripts/train/2.7/language/is.yaml @@ -0,0 +1,12 @@ +dev_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu +language_codes: +- is_icepahc +language_map: + icelandic: is_icepahc + is: is_icepahc + is_icepahc: is_icepahc +test_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu +train_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu diff --git a/scripts/train/2.7/language/it.yaml b/scripts/train/2.7/language/it.yaml new file mode 100644 index 000000000..8697308c9 --- /dev/null +++ b/scripts/train/2.7/language/it.yaml @@ -0,0 +1,32 @@ +dev_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-dev.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-dev.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-dev.conllu +language_codes: +- it_postwita +- it_isdt +- it_partut +- it_twittiro +- it_vit +language_map: + it: it_isdt + it_isdt: it_isdt + it_partut: it_partut + it_postwita: it_postwita + it_twittiro: it_twittiro + it_vit: it_vit + italian: it_isdt +test_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-test.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-test.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-test.conllu +train_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-train.conllu + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-train.conllu + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-train.conllu diff --git a/scripts/train/2.7/language/ja.yaml b/scripts/train/2.7/language/ja.yaml new file mode 100644 index 000000000..7ef98ba08 --- /dev/null +++ b/scripts/train/2.7/language/ja.yaml @@ -0,0 +1,17 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_bccwj +- ja_gsd +language_map: + ja: ja_gsd + ja_bccwj: ja_bccwj + ja_gsd: ja_gsd + japanese: ja_gsd +test_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-train.conllu diff --git a/scripts/train/2.7/language/kk.yaml b/scripts/train/2.7/language/kk.yaml new file mode 100644 index 000000000..39a14ce30 --- /dev/null +++ b/scripts/train/2.7/language/kk.yaml @@ -0,0 +1,12 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kazakh: kk_ktb + kk: kk_ktb + kk_ktb: kk_ktb +test_files: + kk_ktb: 
corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.7/language/kmr.yaml b/scripts/train/2.7/language/kmr.yaml new file mode 100644 index 000000000..420f0ec42 --- /dev/null +++ b/scripts/train/2.7/language/kmr.yaml @@ -0,0 +1,12 @@ +dev_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- kmr_mg +language_map: + kmr: kmr_mg + kmr_mg: kmr_mg + kurmanji: kmr_mg +test_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.7/language/ko.yaml b/scripts/train/2.7/language/ko.yaml new file mode 100644 index 000000000..1674811b5 --- /dev/null +++ b/scripts/train/2.7/language/ko.yaml @@ -0,0 +1,17 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-dev.conllu + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_gsd +- ko_kaist +language_map: + ko: ko_gsd + ko_gsd: ko_gsd + ko_kaist: ko_kaist + korean: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-test.conllu + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-train.conllu + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-train.conllu diff --git a/scripts/train/2.7/language/la.yaml b/scripts/train/2.7/language/la.yaml new file mode 100644 index 000000000..9a0afe5ea --- /dev/null +++ b/scripts/train/2.7/language/la.yaml @@ -0,0 +1,27 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-dev.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-dev.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_perseus +- la_proiel +- la_ittb +- la_llct +language_map: + la: la_ittb + la_ittb: la_ittb + la_llct: la_llct + la_perseus: la_perseus + la_proiel: la_proiel + latin: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-test.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-test.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-train.conllu + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-train.conllu + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-train.conllu + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.7/language/lt.yaml b/scripts/train/2.7/language/lt.yaml new file mode 100644 index 000000000..579f895c4 --- /dev/null +++ b/scripts/train/2.7/language/lt.yaml @@ -0,0 +1,17 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu +language_codes: +- lt_hse +- lt_alksnis +language_map: + lithuanian: lt_alksnis + lt: lt_alksnis + lt_alksnis: lt_alksnis + lt_hse: lt_hse +test_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu + lt_hse: 
corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-train.conllu diff --git a/scripts/train/2.7/language/lv.yaml b/scripts/train/2.7/language/lv.yaml new file mode 100644 index 000000000..564325e6b --- /dev/null +++ b/scripts/train/2.7/language/lv.yaml @@ -0,0 +1,12 @@ +dev_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lv_lvtb +language_map: + latvian: lv_lvtb + lv: lv_lvtb + lv_lvtb: lv_lvtb +test_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.7/language/lzh.yaml b/scripts/train/2.7/language/lzh.yaml new file mode 100644 index 000000000..581ddc1ee --- /dev/null +++ b/scripts/train/2.7/language/lzh.yaml @@ -0,0 +1,12 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu +language_codes: +- lzh_kyoto +language_map: + classical chinese: lzh_kyoto + lzh: lzh_kyoto + lzh_kyoto: lzh_kyoto +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu diff --git a/scripts/train/2.7/language/mr.yaml b/scripts/train/2.7/language/mr.yaml new file mode 100644 index 000000000..b61bec4be --- /dev/null +++ b/scripts/train/2.7/language/mr.yaml @@ -0,0 +1,12 @@ +dev_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu +language_codes: +- mr_ufal +language_map: + marathi: mr_ufal + mr: mr_ufal + mr_ufal: mr_ufal +test_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-test.conllu +train_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-train.conllu diff --git a/scripts/train/2.7/language/mt.yaml b/scripts/train/2.7/language/mt.yaml new file mode 100644 index 000000000..5a81a1ec2 --- /dev/null +++ b/scripts/train/2.7/language/mt.yaml @@ -0,0 +1,12 @@ +dev_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- mt_mudt +language_map: + maltese: mt_mudt + mt: mt_mudt + mt_mudt: mt_mudt +test_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.7/language/nl.yaml b/scripts/train/2.7/language/nl.yaml new file mode 100644 index 000000000..d3801992f --- /dev/null +++ b/scripts/train/2.7/language/nl.yaml @@ -0,0 +1,17 @@ +dev_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu +language_codes: +- nl_alpino +- nl_lassysmall +language_map: + dutch: nl_alpino + nl: nl_alpino + nl_alpino: nl_alpino + nl_lassysmall: nl_lassysmall +test_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-test.conllu + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu +train_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-train.conllu + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu diff --git a/scripts/train/2.7/language/no.yaml 
b/scripts/train/2.7/language/no.yaml new file mode 100644 index 000000000..2d40e0f19 --- /dev/null +++ b/scripts/train/2.7/language/no.yaml @@ -0,0 +1,22 @@ +dev_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu +language_codes: +- no_nynorsklia +- no_bokmaal +- no_nynorsk +language_map: + 'no': no_bokmaal + no_bokmaal: no_bokmaal + no_nynorsk: no_nynorsk + no_nynorsklia: no_nynorsklia + norwegian: no_bokmaal +test_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu +train_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu diff --git a/scripts/train/2.7/language/olo.yaml b/scripts/train/2.7/language/olo.yaml new file mode 100644 index 000000000..404ec6828 --- /dev/null +++ b/scripts/train/2.7/language/olo.yaml @@ -0,0 +1,12 @@ +dev_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- olo_kkpp +language_map: + livvi: olo_kkpp + olo: olo_kkpp + olo_kkpp: olo_kkpp +test_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.7/language/pcm.yaml b/scripts/train/2.7/language/pcm.yaml new file mode 100644 index 000000000..15aaf4853 --- /dev/null +++ b/scripts/train/2.7/language/pcm.yaml @@ -0,0 +1,12 @@ +dev_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-dev.conllu +language_codes: +- pcm_nsc +language_map: + naija: pcm_nsc + pcm: pcm_nsc + pcm_nsc: pcm_nsc +test_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-test.conllu +train_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-train.conllu diff --git a/scripts/train/2.7/language/pl.yaml b/scripts/train/2.7/language/pl.yaml new file mode 100644 index 000000000..2bd82c559 --- /dev/null +++ b/scripts/train/2.7/language/pl.yaml @@ -0,0 +1,17 @@ +dev_files: + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-dev.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-dev.conllu +language_codes: +- pl_pdb +- pl_lfg +language_map: + pl: pl_pdb + pl_lfg: pl_lfg + pl_pdb: pl_pdb + polish: pl_pdb +test_files: + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-test.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-test.conllu +train_files: + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-train.conllu + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-train.conllu diff --git a/scripts/train/2.7/language/pt.yaml b/scripts/train/2.7/language/pt.yaml new file mode 100644 index 000000000..cdcac7465 --- /dev/null +++ b/scripts/train/2.7/language/pt.yaml @@ -0,0 +1,17 @@ +dev_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu +language_codes: +- pt_bosque +- pt_gsd +language_map: + 
portuguese: pt_bosque + pt: pt_bosque + pt_bosque: pt_bosque + pt_gsd: pt_gsd +test_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-test.conllu +train_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-train.conllu diff --git a/scripts/train/2.7/language/ro.yaml b/scripts/train/2.7/language/ro.yaml new file mode 100644 index 000000000..69989f236 --- /dev/null +++ b/scripts/train/2.7/language/ro.yaml @@ -0,0 +1,17 @@ +dev_files: + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- ro_nonstandard +- ro_rrt +language_map: + ro: ro_rrt + ro_nonstandard: ro_nonstandard + ro_rrt: ro_rrt + romanian: ro_rrt +test_files: + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.7/language/ru.yaml b/scripts/train/2.7/language/ru.yaml new file mode 100644 index 000000000..fb6726dbc --- /dev/null +++ b/scripts/train/2.7/language/ru.yaml @@ -0,0 +1,22 @@ +dev_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-dev.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-dev.conllu +language_codes: +- ru_syntagrus +- ru_gsd +- ru_taiga +language_map: + ru: ru_syntagrus + ru_gsd: ru_gsd + ru_syntagrus: ru_syntagrus + ru_taiga: ru_taiga + russian: ru_syntagrus +test_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-test.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-test.conllu +train_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-train.conllu + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-train.conllu diff --git a/scripts/train/2.7/language/sa.yaml b/scripts/train/2.7/language/sa.yaml new file mode 100644 index 000000000..0d63b3b96 --- /dev/null +++ b/scripts/train/2.7/language/sa.yaml @@ -0,0 +1,12 @@ +dev_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu +language_codes: +- sa_vedic +language_map: + sa: sa_vedic + sa_vedic: sa_vedic + sanskrit: sa_vedic +test_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu +train_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu diff --git a/scripts/train/2.7/language/sk.yaml b/scripts/train/2.7/language/sk.yaml new file mode 100644 index 000000000..db44ed17d --- /dev/null +++ b/scripts/train/2.7/language/sk.yaml @@ -0,0 +1,12 @@ +dev_files: + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-dev.conllu +language_codes: +- sk_snk +language_map: + sk: sk_snk + sk_snk: sk_snk + slovak: sk_snk +test_files: + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-test.conllu +train_files: + sk_snk: 
corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-train.conllu diff --git a/scripts/train/2.7/language/sl.yaml b/scripts/train/2.7/language/sl.yaml new file mode 100644 index 000000000..37c93a8a6 --- /dev/null +++ b/scripts/train/2.7/language/sl.yaml @@ -0,0 +1,17 @@ +dev_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu +language_codes: +- sl_sst +- sl_ssj +language_map: + sl: sl_ssj + sl_ssj: sl_ssj + sl_sst: sl_sst + slovenian: sl_ssj +test_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu +train_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-train.conllu diff --git a/scripts/train/2.7/language/sme.yaml b/scripts/train/2.7/language/sme.yaml new file mode 100644 index 000000000..21e428a4b --- /dev/null +++ b/scripts/train/2.7/language/sme.yaml @@ -0,0 +1,12 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + north sami: sme_giella + sme: sme_giella + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.7/language/sr.yaml b/scripts/train/2.7/language/sr.yaml new file mode 100644 index 000000000..4a39d5917 --- /dev/null +++ b/scripts/train/2.7/language/sr.yaml @@ -0,0 +1,12 @@ +dev_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-dev.conllu +language_codes: +- sr_set +language_map: + serbian: sr_set + sr: sr_set + sr_set: sr_set +test_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-test.conllu +train_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-train.conllu diff --git a/scripts/train/2.7/language/sv.yaml b/scripts/train/2.7/language/sv.yaml new file mode 100644 index 000000000..d0ee520c8 --- /dev/null +++ b/scripts/train/2.7/language/sv.yaml @@ -0,0 +1,17 @@ +dev_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-dev.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- sv_lines +- sv_talbanken +language_map: + sv: sv_talbanken + sv_lines: sv_lines + sv_talbanken: sv_talbanken + swedish: sv_talbanken +test_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-test.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-train.conllu + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu diff --git a/scripts/train/2.7/language/swl.yaml b/scripts/train/2.7/language/swl.yaml new file mode 100644 index 000000000..2e406e20f --- /dev/null +++ b/scripts/train/2.7/language/swl.yaml @@ -0,0 +1,12 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swedish sign language: swl_sslc + swl: swl_sslc + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: 
corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.7/language/ta.yaml b/scripts/train/2.7/language/ta.yaml new file mode 100644 index 000000000..3f04b99e0 --- /dev/null +++ b/scripts/train/2.7/language/ta.yaml @@ -0,0 +1,12 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta: ta_ttb + ta_ttb: ta_ttb + tamil: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.7/language/te.yaml b/scripts/train/2.7/language/te.yaml new file mode 100644 index 000000000..184544688 --- /dev/null +++ b/scripts/train/2.7/language/te.yaml @@ -0,0 +1,12 @@ +dev_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te: te_mtg + te_mtg: te_mtg + telugu: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.7/language/tr.yaml b/scripts/train/2.7/language/tr.yaml new file mode 100644 index 000000000..02b0d40ab --- /dev/null +++ b/scripts/train/2.7/language/tr.yaml @@ -0,0 +1,17 @@ +dev_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-dev.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_imst +- tr_boun +language_map: + tr: tr_imst + tr_boun: tr_boun + tr_imst: tr_imst + turkish: tr_imst +test_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-test.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-train.conllu + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-train.conllu diff --git a/scripts/train/2.7/language/ug.yaml b/scripts/train/2.7/language/ug.yaml new file mode 100644 index 000000000..070d6e58e --- /dev/null +++ b/scripts/train/2.7/language/ug.yaml @@ -0,0 +1,12 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug: ug_udt + ug_udt: ug_udt + uyghur: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-test.conllu +train_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.7/language/uk.yaml b/scripts/train/2.7/language/uk.yaml new file mode 100644 index 000000000..0fdba6a13 --- /dev/null +++ b/scripts/train/2.7/language/uk.yaml @@ -0,0 +1,12 @@ +dev_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- uk_iu +language_map: + uk: uk_iu + uk_iu: uk_iu + ukrainian: uk_iu +test_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.7/language/ur.yaml b/scripts/train/2.7/language/ur.yaml new file mode 100644 index 000000000..f3012b293 --- /dev/null +++ b/scripts/train/2.7/language/ur.yaml @@ -0,0 +1,12 @@ +dev_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- ur_udtb +language_map: + ur: ur_udtb + ur_udtb: ur_udtb + urdu: ur_udtb +test_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-test.conllu 
+train_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.7/language/vi.yaml b/scripts/train/2.7/language/vi.yaml new file mode 100644 index 000000000..d2f10054f --- /dev/null +++ b/scripts/train/2.7/language/vi.yaml @@ -0,0 +1,12 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi: vi_vtb + vi_vtb: vi_vtb + vietnamese: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.7/language/wo.yaml b/scripts/train/2.7/language/wo.yaml new file mode 100644 index 000000000..22f0e9d47 --- /dev/null +++ b/scripts/train/2.7/language/wo.yaml @@ -0,0 +1,12 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo: wo_wtb + wo_wtb: wo_wtb + wolof: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.7/language/zh.yaml b/scripts/train/2.7/language/zh.yaml new file mode 100644 index 000000000..93973c66a --- /dev/null +++ b/scripts/train/2.7/language/zh.yaml @@ -0,0 +1,17 @@ +dev_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-dev.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu +language_codes: +- zh_gsdsimp +- zh_gsd +language_map: + chinese: zh_gsdsimp + zh: zh_gsdsimp + zh_gsd: zh_gsd + zh_gsdsimp: zh_gsdsimp +test_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-test.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-train.conllu + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu diff --git a/scripts/train/2.7/treebank/af_afribooms.yaml b/scripts/train/2.7/treebank/af_afribooms.yaml new file mode 100644 index 000000000..b6abfdf34 --- /dev/null +++ b/scripts/train/2.7/treebank/af_afribooms.yaml @@ -0,0 +1,10 @@ +dev_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu +language_codes: +- af_afribooms +language_map: + af_afribooms: af_afribooms +test_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu +train_files: + af_afribooms: corpus/ud-treebanks-v2.7/UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ar_nyuad.yaml b/scripts/train/2.7/treebank/ar_nyuad.yaml new file mode 100644 index 000000000..9e55c4e35 --- /dev/null +++ b/scripts/train/2.7/treebank/ar_nyuad.yaml @@ -0,0 +1,10 @@ +dev_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu +language_codes: +- ar_nyuad +language_map: + ar_nyuad: ar_nyuad +test_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu +train_files: + ar_nyuad: corpus/ud-treebanks-v2.7/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ar_padt.yaml b/scripts/train/2.7/treebank/ar_padt.yaml new file mode 100644 index 000000000..7aa599d4e --- /dev/null +++ b/scripts/train/2.7/treebank/ar_padt.yaml @@ -0,0 +1,10 @@ +dev_files: + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-dev.conllu +language_codes: +- 
ar_padt +language_map: + ar_padt: ar_padt +test_files: + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-test.conllu +train_files: + ar_padt: corpus/ud-treebanks-v2.7/UD_Arabic-PADT/ar_padt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/be_hse.yaml b/scripts/train/2.7/treebank/be_hse.yaml new file mode 100644 index 000000000..2708c15be --- /dev/null +++ b/scripts/train/2.7/treebank/be_hse.yaml @@ -0,0 +1,10 @@ +dev_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-dev.conllu +language_codes: +- be_hse +language_map: + be_hse: be_hse +test_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-test.conllu +train_files: + be_hse: corpus/ud-treebanks-v2.7/UD_Belarusian-HSE/be_hse-ud-train.conllu diff --git a/scripts/train/2.7/treebank/bg_btb.yaml b/scripts/train/2.7/treebank/bg_btb.yaml new file mode 100644 index 000000000..71be2f0ae --- /dev/null +++ b/scripts/train/2.7/treebank/bg_btb.yaml @@ -0,0 +1,10 @@ +dev_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-dev.conllu +language_codes: +- bg_btb +language_map: + bg_btb: bg_btb +test_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-test.conllu +train_files: + bg_btb: corpus/ud-treebanks-v2.7/UD_Bulgarian-BTB/bg_btb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/bxr_bdt.yaml b/scripts/train/2.7/treebank/bxr_bdt.yaml new file mode 100644 index 000000000..a36d88744 --- /dev/null +++ b/scripts/train/2.7/treebank/bxr_bdt.yaml @@ -0,0 +1,10 @@ +dev_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +language_codes: +- bxr_bdt +language_map: + bxr_bdt: bxr_bdt +test_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-test.conllu +train_files: + bxr_bdt: corpus/ud-treebanks-v2.7/UD_Buryat-BDT/bxr_bdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ca_ancora.yaml b/scripts/train/2.7/treebank/ca_ancora.yaml new file mode 100644 index 000000000..125241e08 --- /dev/null +++ b/scripts/train/2.7/treebank/ca_ancora.yaml @@ -0,0 +1,10 @@ +dev_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-dev.conllu +language_codes: +- ca_ancora +language_map: + ca_ancora: ca_ancora +test_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-test.conllu +train_files: + ca_ancora: corpus/ud-treebanks-v2.7/UD_Catalan-AnCora/ca_ancora-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cop_scriptorium.yaml b/scripts/train/2.7/treebank/cop_scriptorium.yaml new file mode 100644 index 000000000..e72b86829 --- /dev/null +++ b/scripts/train/2.7/treebank/cop_scriptorium.yaml @@ -0,0 +1,10 @@ +dev_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu +language_codes: +- cop_scriptorium +language_map: + cop_scriptorium: cop_scriptorium +test_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu +train_files: + cop_scriptorium: corpus/ud-treebanks-v2.7/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cs_cac.yaml b/scripts/train/2.7/treebank/cs_cac.yaml new file mode 100644 index 000000000..3fb055760 --- /dev/null +++ b/scripts/train/2.7/treebank/cs_cac.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-dev.conllu +language_codes: +- cs_cac +language_map: + cs_cac: cs_cac +test_files: + cs_cac: corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-test.conllu +train_files: + cs_cac: 
corpus/ud-treebanks-v2.7/UD_Czech-CAC/cs_cac-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cs_cltt.yaml b/scripts/train/2.7/treebank/cs_cltt.yaml new file mode 100644 index 000000000..418c8911d --- /dev/null +++ b/scripts/train/2.7/treebank/cs_cltt.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-dev.conllu +language_codes: +- cs_cltt +language_map: + cs_cltt: cs_cltt +test_files: + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-test.conllu +train_files: + cs_cltt: corpus/ud-treebanks-v2.7/UD_Czech-CLTT/cs_cltt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cs_fictree.yaml b/scripts/train/2.7/treebank/cs_fictree.yaml new file mode 100644 index 000000000..07cbcab9b --- /dev/null +++ b/scripts/train/2.7/treebank/cs_fictree.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-dev.conllu +language_codes: +- cs_fictree +language_map: + cs_fictree: cs_fictree +test_files: + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-test.conllu +train_files: + cs_fictree: corpus/ud-treebanks-v2.7/UD_Czech-FicTree/cs_fictree-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cs_pdt.yaml b/scripts/train/2.7/treebank/cs_pdt.yaml new file mode 100644 index 000000000..d0b0195e5 --- /dev/null +++ b/scripts/train/2.7/treebank/cs_pdt.yaml @@ -0,0 +1,10 @@ +dev_files: + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-dev.conllu +language_codes: +- cs_pdt +language_map: + cs_pdt: cs_pdt +test_files: + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-test.conllu +train_files: + cs_pdt: corpus/ud-treebanks-v2.7/UD_Czech-PDT/cs_pdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cu_proiel.yaml b/scripts/train/2.7/treebank/cu_proiel.yaml new file mode 100644 index 000000000..4387bc45f --- /dev/null +++ b/scripts/train/2.7/treebank/cu_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu +language_codes: +- cu_proiel +language_map: + cu_proiel: cu_proiel +test_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu +train_files: + cu_proiel: corpus/ud-treebanks-v2.7/UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu diff --git a/scripts/train/2.7/treebank/cy_ccg.yaml b/scripts/train/2.7/treebank/cy_ccg.yaml new file mode 100644 index 000000000..6f064e538 --- /dev/null +++ b/scripts/train/2.7/treebank/cy_ccg.yaml @@ -0,0 +1,10 @@ +dev_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu +language_codes: +- cy_ccg +language_map: + cy_ccg: cy_ccg +test_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-test.conllu +train_files: + cy_ccg: corpus/ud-treebanks-v2.7/UD_Welsh-CCG/cy_ccg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/da_ddt.yaml b/scripts/train/2.7/treebank/da_ddt.yaml new file mode 100644 index 000000000..1ed36a963 --- /dev/null +++ b/scripts/train/2.7/treebank/da_ddt.yaml @@ -0,0 +1,10 @@ +dev_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-dev.conllu +language_codes: +- da_ddt +language_map: + da_ddt: da_ddt +test_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-test.conllu +train_files: + da_ddt: corpus/ud-treebanks-v2.7/UD_Danish-DDT/da_ddt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/de_gsd.yaml b/scripts/train/2.7/treebank/de_gsd.yaml new file mode 100644 index 000000000..22666f67a --- /dev/null +++ 
b/scripts/train/2.7/treebank/de_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-dev.conllu +language_codes: +- de_gsd +language_map: + de_gsd: de_gsd +test_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-test.conllu +train_files: + de_gsd: corpus/ud-treebanks-v2.7/UD_German-GSD/de_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/de_hdt.yaml b/scripts/train/2.7/treebank/de_hdt.yaml new file mode 100644 index 000000000..e3c06cf0d --- /dev/null +++ b/scripts/train/2.7/treebank/de_hdt.yaml @@ -0,0 +1,10 @@ +dev_files: + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-dev.conllu +language_codes: +- de_hdt +language_map: + de_hdt: de_hdt +test_files: + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-test.conllu +train_files: + de_hdt: corpus/ud-treebanks-v2.7/UD_German-HDT/de_hdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/el_gdt.yaml b/scripts/train/2.7/treebank/el_gdt.yaml new file mode 100644 index 000000000..399d58274 --- /dev/null +++ b/scripts/train/2.7/treebank/el_gdt.yaml @@ -0,0 +1,10 @@ +dev_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-dev.conllu +language_codes: +- el_gdt +language_map: + el_gdt: el_gdt +test_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-test.conllu +train_files: + el_gdt: corpus/ud-treebanks-v2.7/UD_Greek-GDT/el_gdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_esl.yaml b/scripts/train/2.7/treebank/en_esl.yaml new file mode 100644 index 000000000..3f8870ae1 --- /dev/null +++ b/scripts/train/2.7/treebank/en_esl.yaml @@ -0,0 +1,10 @@ +dev_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-dev.conllu +language_codes: +- en_esl +language_map: + en_esl: en_esl +test_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-test.conllu +train_files: + en_esl: corpus/ud-treebanks-v2.7/UD_English-ESL/en_esl-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_ewt.yaml b/scripts/train/2.7/treebank/en_ewt.yaml new file mode 100644 index 000000000..004ddac08 --- /dev/null +++ b/scripts/train/2.7/treebank/en_ewt.yaml @@ -0,0 +1,10 @@ +dev_files: + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-dev.conllu +language_codes: +- en_ewt +language_map: + en_ewt: en_ewt +test_files: + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-test.conllu +train_files: + en_ewt: corpus/ud-treebanks-v2.7/UD_English-EWT/en_ewt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_gum.yaml b/scripts/train/2.7/treebank/en_gum.yaml new file mode 100644 index 000000000..5d66bdf8c --- /dev/null +++ b/scripts/train/2.7/treebank/en_gum.yaml @@ -0,0 +1,10 @@ +dev_files: + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-dev.conllu +language_codes: +- en_gum +language_map: + en_gum: en_gum +test_files: + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-test.conllu +train_files: + en_gum: corpus/ud-treebanks-v2.7/UD_English-GUM/en_gum-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_gumreddit.yaml b/scripts/train/2.7/treebank/en_gumreddit.yaml new file mode 100644 index 000000000..d9c7e9874 --- /dev/null +++ b/scripts/train/2.7/treebank/en_gumreddit.yaml @@ -0,0 +1,10 @@ +dev_files: + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-dev.conllu +language_codes: +- en_gumreddit +language_map: + en_gumreddit: en_gumreddit +test_files: + en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-test.conllu +train_files: + 
en_gumreddit: corpus/ud-treebanks-v2.7/UD_English-GUMReddit/en_gumreddit-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_lines.yaml b/scripts/train/2.7/treebank/en_lines.yaml new file mode 100644 index 000000000..fecf089fa --- /dev/null +++ b/scripts/train/2.7/treebank/en_lines.yaml @@ -0,0 +1,10 @@ +dev_files: + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-dev.conllu +language_codes: +- en_lines +language_map: + en_lines: en_lines +test_files: + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-test.conllu +train_files: + en_lines: corpus/ud-treebanks-v2.7/UD_English-LinES/en_lines-ud-train.conllu diff --git a/scripts/train/2.7/treebank/en_partut.yaml b/scripts/train/2.7/treebank/en_partut.yaml new file mode 100644 index 000000000..07b549a78 --- /dev/null +++ b/scripts/train/2.7/treebank/en_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-dev.conllu +language_codes: +- en_partut +language_map: + en_partut: en_partut +test_files: + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-test.conllu +train_files: + en_partut: corpus/ud-treebanks-v2.7/UD_English-ParTUT/en_partut-ud-train.conllu diff --git a/scripts/train/2.7/treebank/es_ancora.yaml b/scripts/train/2.7/treebank/es_ancora.yaml new file mode 100644 index 000000000..def79c43e --- /dev/null +++ b/scripts/train/2.7/treebank/es_ancora.yaml @@ -0,0 +1,10 @@ +dev_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-dev.conllu +language_codes: +- es_ancora +language_map: + es_ancora: es_ancora +test_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-test.conllu +train_files: + es_ancora: corpus/ud-treebanks-v2.7/UD_Spanish-AnCora/es_ancora-ud-train.conllu diff --git a/scripts/train/2.7/treebank/es_gsd.yaml b/scripts/train/2.7/treebank/es_gsd.yaml new file mode 100644 index 000000000..fa3e17140 --- /dev/null +++ b/scripts/train/2.7/treebank/es_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-dev.conllu +language_codes: +- es_gsd +language_map: + es_gsd: es_gsd +test_files: + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-test.conllu +train_files: + es_gsd: corpus/ud-treebanks-v2.7/UD_Spanish-GSD/es_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/et_edt.yaml b/scripts/train/2.7/treebank/et_edt.yaml new file mode 100644 index 000000000..b611d028a --- /dev/null +++ b/scripts/train/2.7/treebank/et_edt.yaml @@ -0,0 +1,10 @@ +dev_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-dev.conllu +language_codes: +- et_edt +language_map: + et_edt: et_edt +test_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-test.conllu +train_files: + et_edt: corpus/ud-treebanks-v2.7/UD_Estonian-EDT/et_edt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/et_ewt.yaml b/scripts/train/2.7/treebank/et_ewt.yaml new file mode 100644 index 000000000..bf1812efc --- /dev/null +++ b/scripts/train/2.7/treebank/et_ewt.yaml @@ -0,0 +1,10 @@ +dev_files: + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-dev.conllu +language_codes: +- et_ewt +language_map: + et_ewt: et_ewt +test_files: + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-test.conllu +train_files: + et_ewt: corpus/ud-treebanks-v2.7/UD_Estonian-EWT/et_ewt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/eu_bdt.yaml b/scripts/train/2.7/treebank/eu_bdt.yaml new file mode 100644 index 000000000..2b6e04cb7 --- 
/dev/null +++ b/scripts/train/2.7/treebank/eu_bdt.yaml @@ -0,0 +1,10 @@ +dev_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-dev.conllu +language_codes: +- eu_bdt +language_map: + eu_bdt: eu_bdt +test_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-test.conllu +train_files: + eu_bdt: corpus/ud-treebanks-v2.7/UD_Basque-BDT/eu_bdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fa_perdt.yaml b/scripts/train/2.7/treebank/fa_perdt.yaml new file mode 100644 index 000000000..facc26e6e --- /dev/null +++ b/scripts/train/2.7/treebank/fa_perdt.yaml @@ -0,0 +1,10 @@ +dev_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-dev.conllu +language_codes: +- fa_perdt +language_map: + fa_perdt: fa_perdt +test_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-test.conllu +train_files: + fa_perdt: corpus/ud-treebanks-v2.7/UD_Persian-PerDT/fa_perdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fa_seraji.yaml b/scripts/train/2.7/treebank/fa_seraji.yaml new file mode 100644 index 000000000..b14ef27cc --- /dev/null +++ b/scripts/train/2.7/treebank/fa_seraji.yaml @@ -0,0 +1,10 @@ +dev_files: + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-dev.conllu +language_codes: +- fa_seraji +language_map: + fa_seraji: fa_seraji +test_files: + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-test.conllu +train_files: + fa_seraji: corpus/ud-treebanks-v2.7/UD_Persian-Seraji/fa_seraji-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fi_ftb.yaml b/scripts/train/2.7/treebank/fi_ftb.yaml new file mode 100644 index 000000000..a71ce3352 --- /dev/null +++ b/scripts/train/2.7/treebank/fi_ftb.yaml @@ -0,0 +1,10 @@ +dev_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-dev.conllu +language_codes: +- fi_ftb +language_map: + fi_ftb: fi_ftb +test_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-test.conllu +train_files: + fi_ftb: corpus/ud-treebanks-v2.7/UD_Finnish-FTB/fi_ftb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fi_tdt.yaml b/scripts/train/2.7/treebank/fi_tdt.yaml new file mode 100644 index 000000000..392750bee --- /dev/null +++ b/scripts/train/2.7/treebank/fi_tdt.yaml @@ -0,0 +1,10 @@ +dev_files: + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-dev.conllu +language_codes: +- fi_tdt +language_map: + fi_tdt: fi_tdt +test_files: + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-test.conllu +train_files: + fi_tdt: corpus/ud-treebanks-v2.7/UD_Finnish-TDT/fi_tdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fo_farpahc.yaml b/scripts/train/2.7/treebank/fo_farpahc.yaml new file mode 100644 index 000000000..fefc8eae0 --- /dev/null +++ b/scripts/train/2.7/treebank/fo_farpahc.yaml @@ -0,0 +1,10 @@ +dev_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu +language_codes: +- fo_farpahc +language_map: + fo_farpahc: fo_farpahc +test_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu +train_files: + fo_farpahc: corpus/ud-treebanks-v2.7/UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fr_ftb.yaml b/scripts/train/2.7/treebank/fr_ftb.yaml new file mode 100644 index 000000000..2d0e67e0c --- /dev/null +++ b/scripts/train/2.7/treebank/fr_ftb.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-dev.conllu +language_codes: +- fr_ftb +language_map: + fr_ftb: fr_ftb +test_files: + fr_ftb: 
corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-test.conllu +train_files: + fr_ftb: corpus/ud-treebanks-v2.7/UD_French-FTB/fr_ftb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fr_gsd.yaml b/scripts/train/2.7/treebank/fr_gsd.yaml new file mode 100644 index 000000000..d54f0d4d1 --- /dev/null +++ b/scripts/train/2.7/treebank/fr_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-dev.conllu +language_codes: +- fr_gsd +language_map: + fr_gsd: fr_gsd +test_files: + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-test.conllu +train_files: + fr_gsd: corpus/ud-treebanks-v2.7/UD_French-GSD/fr_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fr_partut.yaml b/scripts/train/2.7/treebank/fr_partut.yaml new file mode 100644 index 000000000..2fdbe0091 --- /dev/null +++ b/scripts/train/2.7/treebank/fr_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-dev.conllu +language_codes: +- fr_partut +language_map: + fr_partut: fr_partut +test_files: + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-test.conllu +train_files: + fr_partut: corpus/ud-treebanks-v2.7/UD_French-ParTUT/fr_partut-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fr_sequoia.yaml b/scripts/train/2.7/treebank/fr_sequoia.yaml new file mode 100644 index 000000000..2f86a5d12 --- /dev/null +++ b/scripts/train/2.7/treebank/fr_sequoia.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-dev.conllu +language_codes: +- fr_sequoia +language_map: + fr_sequoia: fr_sequoia +test_files: + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-test.conllu +train_files: + fr_sequoia: corpus/ud-treebanks-v2.7/UD_French-Sequoia/fr_sequoia-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fr_spoken.yaml b/scripts/train/2.7/treebank/fr_spoken.yaml new file mode 100644 index 000000000..b8688b225 --- /dev/null +++ b/scripts/train/2.7/treebank/fr_spoken.yaml @@ -0,0 +1,10 @@ +dev_files: + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-dev.conllu +language_codes: +- fr_spoken +language_map: + fr_spoken: fr_spoken +test_files: + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-test.conllu +train_files: + fr_spoken: corpus/ud-treebanks-v2.7/UD_French-Spoken/fr_spoken-ud-train.conllu diff --git a/scripts/train/2.7/treebank/fro_srcmf.yaml b/scripts/train/2.7/treebank/fro_srcmf.yaml new file mode 100644 index 000000000..782d34d43 --- /dev/null +++ b/scripts/train/2.7/treebank/fro_srcmf.yaml @@ -0,0 +1,10 @@ +dev_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu +language_codes: +- fro_srcmf +language_map: + fro_srcmf: fro_srcmf +test_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu +train_files: + fro_srcmf: corpus/ud-treebanks-v2.7/UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ga_idt.yaml b/scripts/train/2.7/treebank/ga_idt.yaml new file mode 100644 index 000000000..79ea117d9 --- /dev/null +++ b/scripts/train/2.7/treebank/ga_idt.yaml @@ -0,0 +1,10 @@ +dev_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-dev.conllu +language_codes: +- ga_idt +language_map: + ga_idt: ga_idt +test_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-test.conllu +train_files: + ga_idt: corpus/ud-treebanks-v2.7/UD_Irish-IDT/ga_idt-ud-train.conllu diff --git 
a/scripts/train/2.7/treebank/gd_arcosg.yaml b/scripts/train/2.7/treebank/gd_arcosg.yaml new file mode 100644 index 000000000..5e446b769 --- /dev/null +++ b/scripts/train/2.7/treebank/gd_arcosg.yaml @@ -0,0 +1,10 @@ +dev_files: + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu +language_codes: +- gd_arcosg +language_map: + gd_arcosg: gd_arcosg +test_files: + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu +train_files: + gd_arcosg: corpus/ud-treebanks-v2.7/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/gl_ctg.yaml b/scripts/train/2.7/treebank/gl_ctg.yaml new file mode 100644 index 000000000..7b953303d --- /dev/null +++ b/scripts/train/2.7/treebank/gl_ctg.yaml @@ -0,0 +1,10 @@ +dev_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-dev.conllu +language_codes: +- gl_ctg +language_map: + gl_ctg: gl_ctg +test_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-test.conllu +train_files: + gl_ctg: corpus/ud-treebanks-v2.7/UD_Galician-CTG/gl_ctg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/gl_treegal.yaml b/scripts/train/2.7/treebank/gl_treegal.yaml new file mode 100644 index 000000000..d8acea7a6 --- /dev/null +++ b/scripts/train/2.7/treebank/gl_treegal.yaml @@ -0,0 +1,10 @@ +dev_files: + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +language_codes: +- gl_treegal +language_map: + gl_treegal: gl_treegal +test_files: + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-test.conllu +train_files: + gl_treegal: corpus/ud-treebanks-v2.7/UD_Galician-TreeGal/gl_treegal-ud-train.conllu diff --git a/scripts/train/2.7/treebank/got_proiel.yaml b/scripts/train/2.7/treebank/got_proiel.yaml new file mode 100644 index 000000000..6e710492c --- /dev/null +++ b/scripts/train/2.7/treebank/got_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-dev.conllu +language_codes: +- got_proiel +language_map: + got_proiel: got_proiel +test_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-test.conllu +train_files: + got_proiel: corpus/ud-treebanks-v2.7/UD_Gothic-PROIEL/got_proiel-ud-train.conllu diff --git a/scripts/train/2.7/treebank/grc_perseus.yaml b/scripts/train/2.7/treebank/grc_perseus.yaml new file mode 100644 index 000000000..af4ebf642 --- /dev/null +++ b/scripts/train/2.7/treebank/grc_perseus.yaml @@ -0,0 +1,10 @@ +dev_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu +language_codes: +- grc_perseus +language_map: + grc_perseus: grc_perseus +test_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu +train_files: + grc_perseus: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu diff --git a/scripts/train/2.7/treebank/grc_proiel.yaml b/scripts/train/2.7/treebank/grc_proiel.yaml new file mode 100644 index 000000000..59532c7a5 --- /dev/null +++ b/scripts/train/2.7/treebank/grc_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu +language_codes: +- grc_proiel +language_map: + grc_proiel: grc_proiel +test_files: + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu +train_files: + grc_proiel: corpus/ud-treebanks-v2.7/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu diff --git 
a/scripts/train/2.7/treebank/he_htb.yaml b/scripts/train/2.7/treebank/he_htb.yaml new file mode 100644 index 000000000..2a5058b60 --- /dev/null +++ b/scripts/train/2.7/treebank/he_htb.yaml @@ -0,0 +1,10 @@ +dev_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-dev.conllu +language_codes: +- he_htb +language_map: + he_htb: he_htb +test_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-test.conllu +train_files: + he_htb: corpus/ud-treebanks-v2.7/UD_Hebrew-HTB/he_htb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/hi_hdtb.yaml b/scripts/train/2.7/treebank/hi_hdtb.yaml new file mode 100644 index 000000000..8e02ace7a --- /dev/null +++ b/scripts/train/2.7/treebank/hi_hdtb.yaml @@ -0,0 +1,10 @@ +dev_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu +language_codes: +- hi_hdtb +language_map: + hi_hdtb: hi_hdtb +test_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu +train_files: + hi_hdtb: corpus/ud-treebanks-v2.7/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/hr_set.yaml b/scripts/train/2.7/treebank/hr_set.yaml new file mode 100644 index 000000000..c96c06368 --- /dev/null +++ b/scripts/train/2.7/treebank/hr_set.yaml @@ -0,0 +1,10 @@ +dev_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-dev.conllu +language_codes: +- hr_set +language_map: + hr_set: hr_set +test_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-test.conllu +train_files: + hr_set: corpus/ud-treebanks-v2.7/UD_Croatian-SET/hr_set-ud-train.conllu diff --git a/scripts/train/2.7/treebank/hsb_ufal.yaml b/scripts/train/2.7/treebank/hsb_ufal.yaml new file mode 100644 index 000000000..92d74ff16 --- /dev/null +++ b/scripts/train/2.7/treebank/hsb_ufal.yaml @@ -0,0 +1,10 @@ +dev_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +language_codes: +- hsb_ufal +language_map: + hsb_ufal: hsb_ufal +test_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu +train_files: + hsb_ufal: corpus/ud-treebanks-v2.7/UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu diff --git a/scripts/train/2.7/treebank/hu_szeged.yaml b/scripts/train/2.7/treebank/hu_szeged.yaml new file mode 100644 index 000000000..26fd79863 --- /dev/null +++ b/scripts/train/2.7/treebank/hu_szeged.yaml @@ -0,0 +1,10 @@ +dev_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu +language_codes: +- hu_szeged +language_map: + hu_szeged: hu_szeged +test_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu +train_files: + hu_szeged: corpus/ud-treebanks-v2.7/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu diff --git a/scripts/train/2.7/treebank/hy_armtdp.yaml b/scripts/train/2.7/treebank/hy_armtdp.yaml new file mode 100644 index 000000000..852ef96c0 --- /dev/null +++ b/scripts/train/2.7/treebank/hy_armtdp.yaml @@ -0,0 +1,10 @@ +dev_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu +language_codes: +- hy_armtdp +language_map: + hy_armtdp: hy_armtdp +test_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu +train_files: + hy_armtdp: corpus/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu diff --git a/scripts/train/2.7/treebank/id_csui.yaml b/scripts/train/2.7/treebank/id_csui.yaml new file mode 100644 index 000000000..e7917a308 --- /dev/null +++ b/scripts/train/2.7/treebank/id_csui.yaml @@ -0,0 +1,10 @@ 
+dev_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu +language_codes: +- id_csui +language_map: + id_csui: id_csui +test_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-test.conllu +train_files: + id_csui: corpus/ud-treebanks-v2.7/UD_Indonesian-CSUI/id_csui-ud-train.conllu diff --git a/scripts/train/2.7/treebank/id_gsd.yaml b/scripts/train/2.7/treebank/id_gsd.yaml new file mode 100644 index 000000000..1cec541b7 --- /dev/null +++ b/scripts/train/2.7/treebank/id_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-dev.conllu +language_codes: +- id_gsd +language_map: + id_gsd: id_gsd +test_files: + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-test.conllu +train_files: + id_gsd: corpus/ud-treebanks-v2.7/UD_Indonesian-GSD/id_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/is_icepahc.yaml b/scripts/train/2.7/treebank/is_icepahc.yaml new file mode 100644 index 000000000..0696cea42 --- /dev/null +++ b/scripts/train/2.7/treebank/is_icepahc.yaml @@ -0,0 +1,10 @@ +dev_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu +language_codes: +- is_icepahc +language_map: + is_icepahc: is_icepahc +test_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu +train_files: + is_icepahc: corpus/ud-treebanks-v2.7/UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu diff --git a/scripts/train/2.7/treebank/it_isdt.yaml b/scripts/train/2.7/treebank/it_isdt.yaml new file mode 100644 index 000000000..b8e4336c8 --- /dev/null +++ b/scripts/train/2.7/treebank/it_isdt.yaml @@ -0,0 +1,10 @@ +dev_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-dev.conllu +language_codes: +- it_isdt +language_map: + it_isdt: it_isdt +test_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-test.conllu +train_files: + it_isdt: corpus/ud-treebanks-v2.7/UD_Italian-ISDT/it_isdt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/it_partut.yaml b/scripts/train/2.7/treebank/it_partut.yaml new file mode 100644 index 000000000..7f7952a0a --- /dev/null +++ b/scripts/train/2.7/treebank/it_partut.yaml @@ -0,0 +1,10 @@ +dev_files: + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-dev.conllu +language_codes: +- it_partut +language_map: + it_partut: it_partut +test_files: + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-test.conllu +train_files: + it_partut: corpus/ud-treebanks-v2.7/UD_Italian-ParTUT/it_partut-ud-train.conllu diff --git a/scripts/train/2.7/treebank/it_postwita.yaml b/scripts/train/2.7/treebank/it_postwita.yaml new file mode 100644 index 000000000..aa6c8fd2c --- /dev/null +++ b/scripts/train/2.7/treebank/it_postwita.yaml @@ -0,0 +1,10 @@ +dev_files: + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu +language_codes: +- it_postwita +language_map: + it_postwita: it_postwita +test_files: + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-test.conllu +train_files: + it_postwita: corpus/ud-treebanks-v2.7/UD_Italian-PoSTWITA/it_postwita-ud-train.conllu diff --git a/scripts/train/2.7/treebank/it_twittiro.yaml b/scripts/train/2.7/treebank/it_twittiro.yaml new file mode 100644 index 000000000..368835542 --- /dev/null +++ b/scripts/train/2.7/treebank/it_twittiro.yaml @@ -0,0 +1,10 @@ +dev_files: + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu +language_codes: +- 
it_twittiro +language_map: + it_twittiro: it_twittiro +test_files: + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu +train_files: + it_twittiro: corpus/ud-treebanks-v2.7/UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu diff --git a/scripts/train/2.7/treebank/it_vit.yaml b/scripts/train/2.7/treebank/it_vit.yaml new file mode 100644 index 000000000..b5a739a4f --- /dev/null +++ b/scripts/train/2.7/treebank/it_vit.yaml @@ -0,0 +1,10 @@ +dev_files: + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-dev.conllu +language_codes: +- it_vit +language_map: + it_vit: it_vit +test_files: + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-test.conllu +train_files: + it_vit: corpus/ud-treebanks-v2.7/UD_Italian-VIT/it_vit-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ja_bccwj.yaml b/scripts/train/2.7/treebank/ja_bccwj.yaml new file mode 100644 index 000000000..7add68f13 --- /dev/null +++ b/scripts/train/2.7/treebank/ja_bccwj.yaml @@ -0,0 +1,10 @@ +dev_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu +language_codes: +- ja_bccwj +language_map: + ja_bccwj: ja_bccwj +test_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu +train_files: + ja_bccwj: corpus/ud-treebanks-v2.7/UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ja_gsd.yaml b/scripts/train/2.7/treebank/ja_gsd.yaml new file mode 100644 index 000000000..7d0e0789b --- /dev/null +++ b/scripts/train/2.7/treebank/ja_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-dev.conllu +language_codes: +- ja_gsd +language_map: + ja_gsd: ja_gsd +test_files: + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-test.conllu +train_files: + ja_gsd: corpus/ud-treebanks-v2.7/UD_Japanese-GSD/ja_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/kk_ktb.yaml b/scripts/train/2.7/treebank/kk_ktb.yaml new file mode 100644 index 000000000..e49adbb8d --- /dev/null +++ b/scripts/train/2.7/treebank/kk_ktb.yaml @@ -0,0 +1,10 @@ +dev_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +language_codes: +- kk_ktb +language_map: + kk_ktb: kk_ktb +test_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-test.conllu +train_files: + kk_ktb: corpus/ud-treebanks-v2.7/UD_Kazakh-KTB/kk_ktb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/kmr_mg.yaml b/scripts/train/2.7/treebank/kmr_mg.yaml new file mode 100644 index 000000000..d0c16dcc5 --- /dev/null +++ b/scripts/train/2.7/treebank/kmr_mg.yaml @@ -0,0 +1,10 @@ +dev_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +language_codes: +- kmr_mg +language_map: + kmr_mg: kmr_mg +test_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-test.conllu +train_files: + kmr_mg: corpus/ud-treebanks-v2.7/UD_Kurmanji-MG/kmr_mg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ko_gsd.yaml b/scripts/train/2.7/treebank/ko_gsd.yaml new file mode 100644 index 000000000..e7a1213c7 --- /dev/null +++ b/scripts/train/2.7/treebank/ko_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-dev.conllu +language_codes: +- ko_gsd +language_map: + ko_gsd: ko_gsd +test_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-test.conllu +train_files: + ko_gsd: corpus/ud-treebanks-v2.7/UD_Korean-GSD/ko_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ko_kaist.yaml 
b/scripts/train/2.7/treebank/ko_kaist.yaml new file mode 100644 index 000000000..ed4eff897 --- /dev/null +++ b/scripts/train/2.7/treebank/ko_kaist.yaml @@ -0,0 +1,10 @@ +dev_files: + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-dev.conllu +language_codes: +- ko_kaist +language_map: + ko_kaist: ko_kaist +test_files: + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-test.conllu +train_files: + ko_kaist: corpus/ud-treebanks-v2.7/UD_Korean-Kaist/ko_kaist-ud-train.conllu diff --git a/scripts/train/2.7/treebank/la_ittb.yaml b/scripts/train/2.7/treebank/la_ittb.yaml new file mode 100644 index 000000000..090f1aea3 --- /dev/null +++ b/scripts/train/2.7/treebank/la_ittb.yaml @@ -0,0 +1,10 @@ +dev_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-dev.conllu +language_codes: +- la_ittb +language_map: + la_ittb: la_ittb +test_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-test.conllu +train_files: + la_ittb: corpus/ud-treebanks-v2.7/UD_Latin-ITTB/la_ittb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/la_llct.yaml b/scripts/train/2.7/treebank/la_llct.yaml new file mode 100644 index 000000000..c7132c23c --- /dev/null +++ b/scripts/train/2.7/treebank/la_llct.yaml @@ -0,0 +1,10 @@ +dev_files: + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-dev.conllu +language_codes: +- la_llct +language_map: + la_llct: la_llct +test_files: + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-test.conllu +train_files: + la_llct: corpus/ud-treebanks-v2.7/UD_Latin-LLCT/la_llct-ud-train.conllu diff --git a/scripts/train/2.7/treebank/la_perseus.yaml b/scripts/train/2.7/treebank/la_perseus.yaml new file mode 100644 index 000000000..feda1d208 --- /dev/null +++ b/scripts/train/2.7/treebank/la_perseus.yaml @@ -0,0 +1,10 @@ +dev_files: + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu +language_codes: +- la_perseus +language_map: + la_perseus: la_perseus +test_files: + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-test.conllu +train_files: + la_perseus: corpus/ud-treebanks-v2.7/UD_Latin-Perseus/la_perseus-ud-train.conllu diff --git a/scripts/train/2.7/treebank/la_proiel.yaml b/scripts/train/2.7/treebank/la_proiel.yaml new file mode 100644 index 000000000..ab611fb38 --- /dev/null +++ b/scripts/train/2.7/treebank/la_proiel.yaml @@ -0,0 +1,10 @@ +dev_files: + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-dev.conllu +language_codes: +- la_proiel +language_map: + la_proiel: la_proiel +test_files: + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-test.conllu +train_files: + la_proiel: corpus/ud-treebanks-v2.7/UD_Latin-PROIEL/la_proiel-ud-train.conllu diff --git a/scripts/train/2.7/treebank/lt_alksnis.yaml b/scripts/train/2.7/treebank/lt_alksnis.yaml new file mode 100644 index 000000000..6a6d9f2b2 --- /dev/null +++ b/scripts/train/2.7/treebank/lt_alksnis.yaml @@ -0,0 +1,10 @@ +dev_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu +language_codes: +- lt_alksnis +language_map: + lt_alksnis: lt_alksnis +test_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu +train_files: + lt_alksnis: corpus/ud-treebanks-v2.7/UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu diff --git a/scripts/train/2.7/treebank/lt_hse.yaml b/scripts/train/2.7/treebank/lt_hse.yaml new file mode 100644 index 000000000..a3479dc87 --- /dev/null +++ b/scripts/train/2.7/treebank/lt_hse.yaml @@ -0,0 +1,10 
@@ +dev_files: + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-dev.conllu +language_codes: +- lt_hse +language_map: + lt_hse: lt_hse +test_files: + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-test.conllu +train_files: + lt_hse: corpus/ud-treebanks-v2.7/UD_Lithuanian-HSE/lt_hse-ud-train.conllu diff --git a/scripts/train/2.7/treebank/lv_lvtb.yaml b/scripts/train/2.7/treebank/lv_lvtb.yaml new file mode 100644 index 000000000..b703d21c3 --- /dev/null +++ b/scripts/train/2.7/treebank/lv_lvtb.yaml @@ -0,0 +1,10 @@ +dev_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu +language_codes: +- lv_lvtb +language_map: + lv_lvtb: lv_lvtb +test_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-test.conllu +train_files: + lv_lvtb: corpus/ud-treebanks-v2.7/UD_Latvian-LVTB/lv_lvtb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/lzh_kyoto.yaml b/scripts/train/2.7/treebank/lzh_kyoto.yaml new file mode 100644 index 000000000..2dc82449e --- /dev/null +++ b/scripts/train/2.7/treebank/lzh_kyoto.yaml @@ -0,0 +1,10 @@ +dev_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu +language_codes: +- lzh_kyoto +language_map: + lzh_kyoto: lzh_kyoto +test_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu +train_files: + lzh_kyoto: corpus/ud-treebanks-v2.7/UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu diff --git a/scripts/train/2.7/treebank/mr_ufal.yaml b/scripts/train/2.7/treebank/mr_ufal.yaml new file mode 100644 index 000000000..cc0183d1f --- /dev/null +++ b/scripts/train/2.7/treebank/mr_ufal.yaml @@ -0,0 +1,10 @@ +dev_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu +language_codes: +- mr_ufal +language_map: + mr_ufal: mr_ufal +test_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-test.conllu +train_files: + mr_ufal: corpus/ud-treebanks-v2.7/UD_Marathi-UFAL/mr_ufal-ud-train.conllu diff --git a/scripts/train/2.7/treebank/mt_mudt.yaml b/scripts/train/2.7/treebank/mt_mudt.yaml new file mode 100644 index 000000000..29d80075f --- /dev/null +++ b/scripts/train/2.7/treebank/mt_mudt.yaml @@ -0,0 +1,10 @@ +dev_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu +language_codes: +- mt_mudt +language_map: + mt_mudt: mt_mudt +test_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-test.conllu +train_files: + mt_mudt: corpus/ud-treebanks-v2.7/UD_Maltese-MUDT/mt_mudt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/nl_alpino.yaml b/scripts/train/2.7/treebank/nl_alpino.yaml new file mode 100644 index 000000000..f045f3713 --- /dev/null +++ b/scripts/train/2.7/treebank/nl_alpino.yaml @@ -0,0 +1,10 @@ +dev_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-dev.conllu +language_codes: +- nl_alpino +language_map: + nl_alpino: nl_alpino +test_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-test.conllu +train_files: + nl_alpino: corpus/ud-treebanks-v2.7/UD_Dutch-Alpino/nl_alpino-ud-train.conllu diff --git a/scripts/train/2.7/treebank/nl_lassysmall.yaml b/scripts/train/2.7/treebank/nl_lassysmall.yaml new file mode 100644 index 000000000..e8021b29c --- /dev/null +++ b/scripts/train/2.7/treebank/nl_lassysmall.yaml @@ -0,0 +1,10 @@ +dev_files: + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu +language_codes: +- nl_lassysmall +language_map: + nl_lassysmall: 
nl_lassysmall +test_files: + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu +train_files: + nl_lassysmall: corpus/ud-treebanks-v2.7/UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu diff --git a/scripts/train/2.7/treebank/no_bokmaal.yaml b/scripts/train/2.7/treebank/no_bokmaal.yaml new file mode 100644 index 000000000..81f85de68 --- /dev/null +++ b/scripts/train/2.7/treebank/no_bokmaal.yaml @@ -0,0 +1,10 @@ +dev_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu +language_codes: +- no_bokmaal +language_map: + no_bokmaal: no_bokmaal +test_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu +train_files: + no_bokmaal: corpus/ud-treebanks-v2.7/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu diff --git a/scripts/train/2.7/treebank/no_nynorsk.yaml b/scripts/train/2.7/treebank/no_nynorsk.yaml new file mode 100644 index 000000000..a375291be --- /dev/null +++ b/scripts/train/2.7/treebank/no_nynorsk.yaml @@ -0,0 +1,10 @@ +dev_files: + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu +language_codes: +- no_nynorsk +language_map: + no_nynorsk: no_nynorsk +test_files: + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu +train_files: + no_nynorsk: corpus/ud-treebanks-v2.7/UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu diff --git a/scripts/train/2.7/treebank/no_nynorsklia.yaml b/scripts/train/2.7/treebank/no_nynorsklia.yaml new file mode 100644 index 000000000..1501ce3d5 --- /dev/null +++ b/scripts/train/2.7/treebank/no_nynorsklia.yaml @@ -0,0 +1,10 @@ +dev_files: + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu +language_codes: +- no_nynorsklia +language_map: + no_nynorsklia: no_nynorsklia +test_files: + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu +train_files: + no_nynorsklia: corpus/ud-treebanks-v2.7/UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu diff --git a/scripts/train/2.7/treebank/olo_kkpp.yaml b/scripts/train/2.7/treebank/olo_kkpp.yaml new file mode 100644 index 000000000..bc8d1ac08 --- /dev/null +++ b/scripts/train/2.7/treebank/olo_kkpp.yaml @@ -0,0 +1,10 @@ +dev_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +language_codes: +- olo_kkpp +language_map: + olo_kkpp: olo_kkpp +test_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-test.conllu +train_files: + olo_kkpp: corpus/ud-treebanks-v2.7/UD_Livvi-KKPP/olo_kkpp-ud-train.conllu diff --git a/scripts/train/2.7/treebank/pcm_nsc.yaml b/scripts/train/2.7/treebank/pcm_nsc.yaml new file mode 100644 index 000000000..812568272 --- /dev/null +++ b/scripts/train/2.7/treebank/pcm_nsc.yaml @@ -0,0 +1,10 @@ +dev_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-dev.conllu +language_codes: +- pcm_nsc +language_map: + pcm_nsc: pcm_nsc +test_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-test.conllu +train_files: + pcm_nsc: corpus/ud-treebanks-v2.7/UD_Naija-NSC/pcm_nsc-ud-train.conllu diff --git a/scripts/train/2.7/treebank/pl_lfg.yaml b/scripts/train/2.7/treebank/pl_lfg.yaml new file mode 100644 index 000000000..52c894e4b --- /dev/null +++ b/scripts/train/2.7/treebank/pl_lfg.yaml @@ -0,0 +1,10 @@ +dev_files: + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-dev.conllu +language_codes: +- pl_lfg +language_map: + pl_lfg: pl_lfg +test_files: + pl_lfg: 
corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-test.conllu +train_files: + pl_lfg: corpus/ud-treebanks-v2.7/UD_Polish-LFG/pl_lfg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/pl_pdb.yaml b/scripts/train/2.7/treebank/pl_pdb.yaml new file mode 100644 index 000000000..9faa17a03 --- /dev/null +++ b/scripts/train/2.7/treebank/pl_pdb.yaml @@ -0,0 +1,10 @@ +dev_files: + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-dev.conllu +language_codes: +- pl_pdb +language_map: + pl_pdb: pl_pdb +test_files: + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-test.conllu +train_files: + pl_pdb: corpus/ud-treebanks-v2.7/UD_Polish-PDB/pl_pdb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/pt_bosque.yaml b/scripts/train/2.7/treebank/pt_bosque.yaml new file mode 100644 index 000000000..43b53adc8 --- /dev/null +++ b/scripts/train/2.7/treebank/pt_bosque.yaml @@ -0,0 +1,10 @@ +dev_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu +language_codes: +- pt_bosque +language_map: + pt_bosque: pt_bosque +test_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-test.conllu +train_files: + pt_bosque: corpus/ud-treebanks-v2.7/UD_Portuguese-Bosque/pt_bosque-ud-train.conllu diff --git a/scripts/train/2.7/treebank/pt_gsd.yaml b/scripts/train/2.7/treebank/pt_gsd.yaml new file mode 100644 index 000000000..e60578ac3 --- /dev/null +++ b/scripts/train/2.7/treebank/pt_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu +language_codes: +- pt_gsd +language_map: + pt_gsd: pt_gsd +test_files: + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-test.conllu +train_files: + pt_gsd: corpus/ud-treebanks-v2.7/UD_Portuguese-GSD/pt_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/qhe_hiencs.yaml b/scripts/train/2.7/treebank/qhe_hiencs.yaml new file mode 100644 index 000000000..dd14be952 --- /dev/null +++ b/scripts/train/2.7/treebank/qhe_hiencs.yaml @@ -0,0 +1,10 @@ +dev_files: + qhe_hiencs: corpus/ud-treebanks-v2.7/UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu +language_codes: +- qhe_hiencs +language_map: + qhe_hiencs: qhe_hiencs +test_files: + qhe_hiencs: corpus/ud-treebanks-v2.7/UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu +train_files: + qhe_hiencs: corpus/ud-treebanks-v2.7/UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu diff --git a/scripts/train/2.7/treebank/qtd_sagt.yaml b/scripts/train/2.7/treebank/qtd_sagt.yaml new file mode 100644 index 000000000..f332a6ac4 --- /dev/null +++ b/scripts/train/2.7/treebank/qtd_sagt.yaml @@ -0,0 +1,10 @@ +dev_files: + qtd_sagt: corpus/ud-treebanks-v2.7/UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu +language_codes: +- qtd_sagt +language_map: + qtd_sagt: qtd_sagt +test_files: + qtd_sagt: corpus/ud-treebanks-v2.7/UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu +train_files: + qtd_sagt: corpus/ud-treebanks-v2.7/UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ro_nonstandard.yaml b/scripts/train/2.7/treebank/ro_nonstandard.yaml new file mode 100644 index 000000000..4866db000 --- /dev/null +++ b/scripts/train/2.7/treebank/ro_nonstandard.yaml @@ -0,0 +1,10 @@ +dev_files: + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu +language_codes: +- ro_nonstandard +language_map: + ro_nonstandard: ro_nonstandard +test_files: + ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu +train_files: + 
ro_nonstandard: corpus/ud-treebanks-v2.7/UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ro_rrt.yaml b/scripts/train/2.7/treebank/ro_rrt.yaml new file mode 100644 index 000000000..4335ed645 --- /dev/null +++ b/scripts/train/2.7/treebank/ro_rrt.yaml @@ -0,0 +1,10 @@ +dev_files: + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-dev.conllu +language_codes: +- ro_rrt +language_map: + ro_rrt: ro_rrt +test_files: + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-test.conllu +train_files: + ro_rrt: corpus/ud-treebanks-v2.7/UD_Romanian-RRT/ro_rrt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ro_simonero.yaml b/scripts/train/2.7/treebank/ro_simonero.yaml new file mode 100644 index 000000000..2ce6af2e5 --- /dev/null +++ b/scripts/train/2.7/treebank/ro_simonero.yaml @@ -0,0 +1,10 @@ +dev_files: + ro_simonero: corpus/ud-treebanks-v2.7/UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu +language_codes: +- ro_simonero +language_map: + ro_simonero: ro_simonero +test_files: + ro_simonero: corpus/ud-treebanks-v2.7/UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu +train_files: + ro_simonero: corpus/ud-treebanks-v2.7/UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ru_gsd.yaml b/scripts/train/2.7/treebank/ru_gsd.yaml new file mode 100644 index 000000000..18945d5d0 --- /dev/null +++ b/scripts/train/2.7/treebank/ru_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-dev.conllu +language_codes: +- ru_gsd +language_map: + ru_gsd: ru_gsd +test_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-test.conllu +train_files: + ru_gsd: corpus/ud-treebanks-v2.7/UD_Russian-GSD/ru_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ru_syntagrus.yaml b/scripts/train/2.7/treebank/ru_syntagrus.yaml new file mode 100644 index 000000000..91c9e68e5 --- /dev/null +++ b/scripts/train/2.7/treebank/ru_syntagrus.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu +language_codes: +- ru_syntagrus +language_map: + ru_syntagrus: ru_syntagrus +test_files: + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu +train_files: + ru_syntagrus: corpus/ud-treebanks-v2.7/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ru_taiga.yaml b/scripts/train/2.7/treebank/ru_taiga.yaml new file mode 100644 index 000000000..ba185c88c --- /dev/null +++ b/scripts/train/2.7/treebank/ru_taiga.yaml @@ -0,0 +1,10 @@ +dev_files: + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-dev.conllu +language_codes: +- ru_taiga +language_map: + ru_taiga: ru_taiga +test_files: + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-test.conllu +train_files: + ru_taiga: corpus/ud-treebanks-v2.7/UD_Russian-Taiga/ru_taiga-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sa_vedic.yaml b/scripts/train/2.7/treebank/sa_vedic.yaml new file mode 100644 index 000000000..65adc7801 --- /dev/null +++ b/scripts/train/2.7/treebank/sa_vedic.yaml @@ -0,0 +1,10 @@ +dev_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu +language_codes: +- sa_vedic +language_map: + sa_vedic: sa_vedic +test_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu +train_files: + sa_vedic: corpus/ud-treebanks-v2.7/UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu diff --git
a/scripts/train/2.7/treebank/sk_snk.yaml b/scripts/train/2.7/treebank/sk_snk.yaml new file mode 100644 index 000000000..53f0c2d09 --- /dev/null +++ b/scripts/train/2.7/treebank/sk_snk.yaml @@ -0,0 +1,10 @@ +dev_files: + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-dev.conllu +language_codes: +- sk_snk +language_map: + sk_snk: sk_snk +test_files: + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-test.conllu +train_files: + sk_snk: corpus/ud-treebanks-v2.7/UD_Slovak-SNK/sk_snk-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sl_ssj.yaml b/scripts/train/2.7/treebank/sl_ssj.yaml new file mode 100644 index 000000000..81c2e30a2 --- /dev/null +++ b/scripts/train/2.7/treebank/sl_ssj.yaml @@ -0,0 +1,10 @@ +dev_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu +language_codes: +- sl_ssj +language_map: + sl_ssj: sl_ssj +test_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-test.conllu +train_files: + sl_ssj: corpus/ud-treebanks-v2.7/UD_Slovenian-SSJ/sl_ssj-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sl_sst.yaml b/scripts/train/2.7/treebank/sl_sst.yaml new file mode 100644 index 000000000..dc48f16ee --- /dev/null +++ b/scripts/train/2.7/treebank/sl_sst.yaml @@ -0,0 +1,10 @@ +dev_files: + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu +language_codes: +- sl_sst +language_map: + sl_sst: sl_sst +test_files: + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-test.conllu +train_files: + sl_sst: corpus/ud-treebanks-v2.7/UD_Slovenian-SST/sl_sst-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sme_giella.yaml b/scripts/train/2.7/treebank/sme_giella.yaml new file mode 100644 index 000000000..c5b3e4cf1 --- /dev/null +++ b/scripts/train/2.7/treebank/sme_giella.yaml @@ -0,0 +1,10 @@ +dev_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +language_codes: +- sme_giella +language_map: + sme_giella: sme_giella +test_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-test.conllu +train_files: + sme_giella: corpus/ud-treebanks-v2.7/UD_North_Sami-Giella/sme_giella-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sr_set.yaml b/scripts/train/2.7/treebank/sr_set.yaml new file mode 100644 index 000000000..f79bb6cf8 --- /dev/null +++ b/scripts/train/2.7/treebank/sr_set.yaml @@ -0,0 +1,10 @@ +dev_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-dev.conllu +language_codes: +- sr_set +language_map: + sr_set: sr_set +test_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-test.conllu +train_files: + sr_set: corpus/ud-treebanks-v2.7/UD_Serbian-SET/sr_set-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sv_lines.yaml b/scripts/train/2.7/treebank/sv_lines.yaml new file mode 100644 index 000000000..d2e0226ef --- /dev/null +++ b/scripts/train/2.7/treebank/sv_lines.yaml @@ -0,0 +1,10 @@ +dev_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-dev.conllu +language_codes: +- sv_lines +language_map: + sv_lines: sv_lines +test_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-test.conllu +train_files: + sv_lines: corpus/ud-treebanks-v2.7/UD_Swedish-LinES/sv_lines-ud-train.conllu diff --git a/scripts/train/2.7/treebank/sv_talbanken.yaml b/scripts/train/2.7/treebank/sv_talbanken.yaml new file mode 100644 index 000000000..158ab5cf3 --- /dev/null +++ b/scripts/train/2.7/treebank/sv_talbanken.yaml @@ -0,0 +1,10 @@ +dev_files: + sv_talbanken: 
corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu +language_codes: +- sv_talbanken +language_map: + sv_talbanken: sv_talbanken +test_files: + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu +train_files: + sv_talbanken: corpus/ud-treebanks-v2.7/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu diff --git a/scripts/train/2.7/treebank/swl_sslc.yaml b/scripts/train/2.7/treebank/swl_sslc.yaml new file mode 100644 index 000000000..1ce0cef2b --- /dev/null +++ b/scripts/train/2.7/treebank/swl_sslc.yaml @@ -0,0 +1,10 @@ +dev_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu +language_codes: +- swl_sslc +language_map: + swl_sslc: swl_sslc +test_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu +train_files: + swl_sslc: corpus/ud-treebanks-v2.7/UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ta_ttb.yaml b/scripts/train/2.7/treebank/ta_ttb.yaml new file mode 100644 index 000000000..85e120cb4 --- /dev/null +++ b/scripts/train/2.7/treebank/ta_ttb.yaml @@ -0,0 +1,10 @@ +dev_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-dev.conllu +language_codes: +- ta_ttb +language_map: + ta_ttb: ta_ttb +test_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-test.conllu +train_files: + ta_ttb: corpus/ud-treebanks-v2.7/UD_Tamil-TTB/ta_ttb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/te_mtg.yaml b/scripts/train/2.7/treebank/te_mtg.yaml new file mode 100644 index 000000000..effbea412 --- /dev/null +++ b/scripts/train/2.7/treebank/te_mtg.yaml @@ -0,0 +1,10 @@ +dev_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-dev.conllu +language_codes: +- te_mtg +language_map: + te_mtg: te_mtg +test_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-test.conllu +train_files: + te_mtg: corpus/ud-treebanks-v2.7/UD_Telugu-MTG/te_mtg-ud-train.conllu diff --git a/scripts/train/2.7/treebank/tr_boun.yaml b/scripts/train/2.7/treebank/tr_boun.yaml new file mode 100644 index 000000000..f79df8d7f --- /dev/null +++ b/scripts/train/2.7/treebank/tr_boun.yaml @@ -0,0 +1,10 @@ +dev_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-dev.conllu +language_codes: +- tr_boun +language_map: + tr_boun: tr_boun +test_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-test.conllu +train_files: + tr_boun: corpus/ud-treebanks-v2.7/UD_Turkish-BOUN/tr_boun-ud-train.conllu diff --git a/scripts/train/2.7/treebank/tr_imst.yaml b/scripts/train/2.7/treebank/tr_imst.yaml new file mode 100644 index 000000000..a7e13e730 --- /dev/null +++ b/scripts/train/2.7/treebank/tr_imst.yaml @@ -0,0 +1,10 @@ +dev_files: + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-dev.conllu +language_codes: +- tr_imst +language_map: + tr_imst: tr_imst +test_files: + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-test.conllu +train_files: + tr_imst: corpus/ud-treebanks-v2.7/UD_Turkish-IMST/tr_imst-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ug_udt.yaml b/scripts/train/2.7/treebank/ug_udt.yaml new file mode 100644 index 000000000..477924459 --- /dev/null +++ b/scripts/train/2.7/treebank/ug_udt.yaml @@ -0,0 +1,10 @@ +dev_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-dev.conllu +language_codes: +- ug_udt +language_map: + ug_udt: ug_udt +test_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-test.conllu 
+train_files: + ug_udt: corpus/ud-treebanks-v2.7/UD_Uyghur-UDT/ug_udt-ud-train.conllu diff --git a/scripts/train/2.7/treebank/uk_iu.yaml b/scripts/train/2.7/treebank/uk_iu.yaml new file mode 100644 index 000000000..90782af1f --- /dev/null +++ b/scripts/train/2.7/treebank/uk_iu.yaml @@ -0,0 +1,10 @@ +dev_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-dev.conllu +language_codes: +- uk_iu +language_map: + uk_iu: uk_iu +test_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-test.conllu +train_files: + uk_iu: corpus/ud-treebanks-v2.7/UD_Ukrainian-IU/uk_iu-ud-train.conllu diff --git a/scripts/train/2.7/treebank/ur_udtb.yaml b/scripts/train/2.7/treebank/ur_udtb.yaml new file mode 100644 index 000000000..516dffc1c --- /dev/null +++ b/scripts/train/2.7/treebank/ur_udtb.yaml @@ -0,0 +1,10 @@ +dev_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-dev.conllu +language_codes: +- ur_udtb +language_map: + ur_udtb: ur_udtb +test_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-test.conllu +train_files: + ur_udtb: corpus/ud-treebanks-v2.7/UD_Urdu-UDTB/ur_udtb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/vi_vtb.yaml b/scripts/train/2.7/treebank/vi_vtb.yaml new file mode 100644 index 000000000..572f6718a --- /dev/null +++ b/scripts/train/2.7/treebank/vi_vtb.yaml @@ -0,0 +1,10 @@ +dev_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu +language_codes: +- vi_vtb +language_map: + vi_vtb: vi_vtb +test_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-test.conllu +train_files: + vi_vtb: corpus/ud-treebanks-v2.7/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/wo_wtb.yaml b/scripts/train/2.7/treebank/wo_wtb.yaml new file mode 100644 index 000000000..eb3372b58 --- /dev/null +++ b/scripts/train/2.7/treebank/wo_wtb.yaml @@ -0,0 +1,10 @@ +dev_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-dev.conllu +language_codes: +- wo_wtb +language_map: + wo_wtb: wo_wtb +test_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-test.conllu +train_files: + wo_wtb: corpus/ud-treebanks-v2.7/UD_Wolof-WTB/wo_wtb-ud-train.conllu diff --git a/scripts/train/2.7/treebank/zh_gsd.yaml b/scripts/train/2.7/treebank/zh_gsd.yaml new file mode 100644 index 000000000..cbba359e3 --- /dev/null +++ b/scripts/train/2.7/treebank/zh_gsd.yaml @@ -0,0 +1,10 @@ +dev_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-dev.conllu +language_codes: +- zh_gsd +language_map: + zh_gsd: zh_gsd +test_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-test.conllu +train_files: + zh_gsd: corpus/ud-treebanks-v2.7/UD_Chinese-GSD/zh_gsd-ud-train.conllu diff --git a/scripts/train/2.7/treebank/zh_gsdsimp.yaml b/scripts/train/2.7/treebank/zh_gsdsimp.yaml new file mode 100644 index 000000000..112888d9b --- /dev/null +++ b/scripts/train/2.7/treebank/zh_gsdsimp.yaml @@ -0,0 +1,10 @@ +dev_files: + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu +language_codes: +- zh_gsdsimp +language_map: + zh_gsdsimp: zh_gsdsimp +test_files: + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu +train_files: + zh_gsdsimp: corpus/ud-treebanks-v2.7/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu diff --git a/scripts/train/multi_train.py b/scripts/train/multi_train.py new file mode 100644 index 000000000..b552c8e9f --- /dev/null +++ b/scripts/train/multi_train.py @@ -0,0 +1,114 @@ +import json, os, traceback +from 
argparse import ArgumentParser
+import pytorch_lightning as pl
+from pprint import pprint
+import subprocess
+
+parser = ArgumentParser(description='Multiple language trainer')
+parser.add_argument('--task', action='store', dest='task',
+                    help='Type of task: "tokenizer", "lemmatizer", "cwe", "tagger", "parser"')
+parser.add_argument('--train', action='store', dest='train_file',
+                    help='Training configuration (yaml) file')
+parser.add_argument('--patience', action='store', type=int, default=20, dest='patience',
+                    help='Number of epochs before early stopping (default=20)')
+parser.add_argument('--store_folder', action='store', dest='store_folder', help='Output folder')
+parser.add_argument('--num-workers', action='store', dest='num_workers', type=int,
+                    help='How many dataloaders to use (default=4)', default=4)
+parser.add_argument('--batch-size', action='store', type=int, default=16, dest='batch_size',
+                    help='Batch size (default=16)')
+parser.add_argument('--debug', action='store_true', dest='debug',
+                    help='Run in debug mode')
+parser.add_argument('--resume', action='store_true', dest='resume', help='Resume training')
+parser.add_argument('--lm-model', action='store', dest='lm_model',
+                    help='What LM model to use (default=xlm-roberta-base)')
+parser.add_argument('--lm-device', action='store', dest='lm_device', default='cuda:0',
+                    help='Where to load LM (default=cuda:0)')
+parser.add_argument('--config', action='store', dest='config_file', help='Load config file')
+
+parser = pl.Trainer.add_argparse_args(parser)  # add all pytorch lightning params here as well
+
+parser.add_argument('--force_all', action='store_true', dest='force_all', help='Overwrite everything')
+parser.add_argument('--retry_failed', action='store_true', dest='retry_failed', help='Retry unfinished training runs')
+parser.add_argument('--yaml_folder', action='store', dest='yaml_folder', help='Where the yaml config files are stored')
+parser.add_argument('--suffix', action='store', dest='suffix', help='Model name suffix')
+args = parser.parse_args()
+
+print(args)
+
+print(f"Running {args.task} on {args.yaml_folder}")
+
+# load training status
+if os.path.exists(f"{args.task}-status.json"):
+    with open(f"{args.task}-status.json", "r") as f:
+        jobs = json.load(f)
+else:
+    jobs = {}
+
+# list all yaml files in folder
+files = os.listdir(args.yaml_folder)
+yamls = {}
+for file in files:
+    if file.endswith(".yaml"):
+        yamls[file.replace(".yaml", "")] = os.path.join(args.yaml_folder, file)
+
+for yaml in yamls:
+    do_it = False
+    # run a job when it has never been attempted, when --force_all is set,
+    # or when its last run did not finish ("Done") and --retry_failed is set
+    if yaml in jobs and jobs[yaml] != "Done" and args.retry_failed:
+        do_it = True
+    if args.force_all:
+        do_it = True
+    if yaml not in jobs:
+        do_it = True
+    if not do_it:
+        print(f"Skipping job {yaml} ...\n")
+        continue
+
+    store = os.path.join(args.store_folder, f"{yaml}-{args.suffix}-{args.task}")
+    print("\n\n")
+    print("_" * 80)
+    print(f"Running job {yaml} with model store = {store}")
+
+    jobs[yaml] = "Training"
+    with open(f"{args.task}-status.json", "w") as f:
+        json.dump(jobs, f)
+    try:
+        cmd = ["python3", "cube/trainer.py",
+               "--task", f"{args.task}",
+               "--train", f"{yamls[yaml]}",
+               "--store", f"{store}",
+               "--gpus", f"{args.gpus}",
+               "--num-workers", f"{args.num_workers}",
+               "--accelerator", f"{args.accelerator}",
+               "--batch-size", f"{args.batch_size}",
+               "--accumulate_grad_batches", f"{args.accumulate_grad_batches}",
+               "--deterministic", f"{args.deterministic}",
+               "--num_nodes", f"{args.num_nodes}",
+ "--num_processes", f"{args.num_processes}", + "--precision", f"{args.precision}", + #"--tpu_cores", f"{args.tpu_cores}", + #"--resume", f"{args.resume}", + "--lm-model", f"{args.lm_model}", + "--lm-device", f"{args.lm_device}"] + print("\n" + " ".join(cmd).strip()) + subprocess.run(cmd, check=True) + + jobs[yaml] = "Done" + with open(f"{args.task}-status.json", "w") as f: + json.dump(jobs, f) + + except Exception as ex: + print(f"Job has failed: {ex}") + #traceback.print_stack() + import sys + exc_info = sys.exc_info() + traceback.print_exception(*exc_info) + del exc_info + + + + + + + + diff --git a/scripts/ud_table.html b/scripts/ud_table.html new file mode 100644 index 000000000..9da15f9dc --- /dev/null +++ b/scripts/ud_table.html @@ -0,0 +1,8736 @@ +
diff --git a/scripts/ud_table.html b/scripts/ud_table.html new file mode 100644 index 000000000..9da15f9dc --- /dev/null +++ b/scripts/ud_table.html @@ -0,0 +1,8736 @@
+[8,736 lines of HTML scraped from the Universal Dependencies treebank overview page. With the markup stripped, the only recoverable content is per-language sections such as "Ancient Greek treebanks", treebank blurbs ("UD_Ancient_Greek-PROIEL is converted from the Ancient Greek data in the PROIEL treebank, and consists of the New Testament plus selections from Herodotus."), pointers to comparative treebank statistics, and language-documentation status notes ("The language hub documentation has not yet been created or ported from the UDv1 documentation."). The raw markup is omitted here.]
\ No newline at end of file diff --git a/setup.py b/setup.py index 4e325514a..ef4b23d23 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def parse_requirements(filename, session=None): setuptools.setup( name="nlpcube", - version="0.1.0.8", + version="0.3.1.0", author="Multiple authors", author_email="tiberiu44@gmail.com", description="Natural Language Processing Toolkit with support for tokenization, sentence splitting, lemmatization, tagging and parsing for more than 60 languages", diff --git a/test.py b/test.py new file mode 100644 index 000000000..0590afa45 --- /dev/null +++ b/test.py @@ -0,0 +1,76 @@ +import sys + +sys.path.append('') +import torch +from cube.networks.tokenizer import Tokenizer +from cube.networks.tagger import Tagger +from cube.io_utils.config import TokenizerConfig, TaggerConfig, ParserConfig +from cube.io_utils.encodings import Encodings +from cube.networks.utils_tokenizer import TokenCollateFTLanguasito, TokenCollateHF +from cube.networks.lm import LMHelperFT, LMHelperHF +from cube.networks.utils import MorphoCollate +from cube.networks.parser import Parser + +# # tokenizer +# enc = Encodings() +# enc.load('data/tokenizer-ro-transformer.encodings') +# +# config = TokenizerConfig() +# tokenizer = Tokenizer(config, enc, language_codes=['ro_nonstandard', 'ro_rrt'], ext_word_emb=[768 for _ in range(13)]) +# +# model = torch.load('data/tokenizer-ro-transformer.ro_rrt.sent', map_location='cpu') +# +# tokenizer.load_state_dict(model['state_dict']) +# +# collate = TokenCollateHF(enc, lm_model='xlm-roberta-base', lm_device='cpu') +# text = open('corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-test.txt').read() +# # text = 'Și eu am mere. Ana are mere, dar nu are pere. Acesta este un test.' +# d = tokenizer.process(text, collate, lang_id=1, batch_size=4) +# tokenizer +enc = Encodings() +enc.load('data/tokenizer-ro-fasttext.encodings') + +config = TokenizerConfig() +tokenizer = Tokenizer(config, enc, language_codes=['ro_rrt'], ext_word_emb=[300]) + +model = torch.load('data/tokenizer-ro-fasttext.ro_rrt.sent', map_location='cpu') + +tokenizer.load_state_dict(model['state_dict']) + +collate = TokenCollateFTLanguasito(enc, lm_model='fasttext:ro', lm_device='cpu') +text = open('corpus/ud-treebanks-v2.5/UD_Romanian-RRT/ro_rrt-ud-test.txt').read() +# text = 'Și eu am mere. Ana are mere, dar nu are pere. Acesta este un test.' 
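+# process() below runs tokenization and sentence splitting over the raw text;
+# lang_id=0 selects 'ro_rrt', the only entry in this tokenizer's language_codes
+# list. The parser loaded further down was trained with two codes
+# (['ro_nonstandard', 'ro_rrt']), which is presumably why each sentence is
+# re-indexed to lang_id=1 (= ro_rrt) immediately after tokenization.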
+d = tokenizer.process(text, collate, lang_id=0, batch_size=4) +for ii in range(len(d.sentences)): + d.sentences[ii].lang_id = 1 + +# helper = LMHelperFT(model='ro') +# helper.apply(d) + +# # tagger +# enc = Encodings() +# enc.load('data/tagger-ro-fasttext.encodings') +# model = torch.load('data/tagger-ro-fasttext.ro_rrt.upos', map_location='cpu') +# config = TaggerConfig() +# config.load('data/tagger-ro-fasttext.config') +# tagger = Tagger(config, enc, ext_word_emb=helper.get_embedding_size(), language_codes=['ro_nonstandard', 'ro_rrt']) +# tagger.load_state_dict(model['state_dict']) +# collate = MorphoCollate(enc) +# d = tagger.process(d, collate) + +# parser +# del helper +helper = LMHelperHF(model='xlm-roberta-base') +helper.apply(d) +enc = Encodings() +enc.load('data/parser-ro-transformer.encodings') +collate = MorphoCollate(enc) +model = torch.load('data/parser-ro-transformer.ro_rrt.uas', map_location='cpu') +config = ParserConfig() +config.load('data/parser-ro-transformer.config') +parser = Parser(config, enc, ext_word_emb=helper.get_embedding_size(), language_codes=['ro_nonstandard', 'ro_rrt']) +parser.load_state_dict(model['state_dict']) +d = parser.process(d, collate) + +print(d) +print("") diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index a9f2fb3bd..000000000 --- a/tests/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -scratch/ -scratch/* -my_model-1.0/ -*.zip \ No newline at end of file diff --git a/tests/README.md b/tests/README.md deleted file mode 100644 index 4d3a97c8f..000000000 --- a/tests/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# NLP-Cube Tests - -To perform automatic testing, simply run (from the main folder): - - -``` -python3 tests/main_test.py -``` - - -and - - -``` -python3 tests/api_test.py -``` - - -Please run them in this sequence as ``main_test.py`` creates a local model that ``api_test.py`` expects to find. - - diff --git a/tests/api_tests.py b/tests/api_tests.py deleted file mode 100644 index bd476f307..000000000 --- a/tests/api_tests.py +++ /dev/null @@ -1,282 +0,0 @@ -""" -This class should test: - -0. Init the model_store object and list online models -1. Download and run an online model -2. Run a local model (should be created with main_tests.py before in tests/my_model-1.0) -3.1. Package a local model *without* embeddings link set in metadata.json -3.2. Import it into NLP-Cube -3.3. Run the local model with manual embeddings - -4.1. Package a local model *with* embeddings link set in metadata.json -4.2. Import it into NLP-Cube -4.3. 
Run the local model without manual embeddings - -""" -import os, sys, subprocess -import unittest - -class Api_Tests(unittest.TestCase): - - def setUp(self): - # get current directory - self.root_path = os.path.dirname(os.path.realpath(__file__)) - self.root_path = os.path.abspath(os.path.join(self.root_path, os.pardir)) - self.main_file_path = os.path.join(self.root_path, "cube", "main.py") - self.scripts_path = os.path.join(self.root_path, "scripts") - self.corpus_path = os.path.join(self.root_path, "tests", "test_corpus") - self.model_path = os.path.join(self.root_path, "tests", "my_model-1.0") - self.local_model_repo = os.path.join(self.root_path, "tests") - self.scratch_path = os.path.join(self.root_path, "tests", "scratch") - self.input_file_path = os.path.join(self.corpus_path, "en_ewt-ud-test.txt") - self.output_file_path = os.path.join(self.scratch_path, "en_ewt-ud-test-output.conllu") - - if not os.path.exists(self.model_path): - os.makedirs(self.model_path) - if not os.path.exists(self.scratch_path): - os.makedirs(self.scratch_path) - - #import root_path - sys.path.append(self.root_path) - - #print("[setUp] Absolute path of NLP-Cube: "+self.root_path) - #print() - - - def test_0_init_model_store_and_list_online_models(self): - print("\n\33[33m{}\33[0m".format("0. Loading the model store and querying the online database ...")) - from cube.io_utils.model_store import ModelMetadata, ModelStore - model_store_object = ModelStore() - online_models = model_store_object.list_online_models() - print("Found "+str(len(online_models))+ " models online.") - self.assertTrue(len(online_models)>0) - - def test_1_1_download_and_run_an_online_model_latest_version(self): - print("\n\33[33m{}\33[0m".format("1.1 Loading an online model (latest_version) ...")) - from cube.api import Cube - cube = Cube(verbose=True) - #cube.load('en_small', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True) - cube.load('bxr', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True) - cube.metadata.info() - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - sentences = cube(text) - self.assertTrue(len(sentences)>0) - self.assertTrue(len(sentences[0])>0) - - def test_1_2_download_and_run_an_online_model_specific_version(self): - print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ...")) - from cube.api import Cube - cube = Cube(verbose=True) - cube.load('sme', version='1.0', tokenization=True, compound_word_expanding=False, tagging=False, lemmatization=False, parsing=False) - cube.metadata.info() - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - sentences = cube(text) - self.assertTrue(len(sentences)>0) - self.assertTrue(len(sentences[0])>0) - - # This test needs my_model-1.0 to be locally created with main_tests.py - def test_2_run_a_local_model(self): - print("\n\33[33m{}\33[0m".format("2. 
Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ...")) - embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec") - from cube.api import Cube - cube = Cube(verbose=True) - cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_models_repository=self.local_model_repo, local_embeddings_file=embeddings) - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - sentences = cube(text) - self.assertTrue(len(sentences)>0) - self.assertTrue(len(sentences[0])>0) - - - def test_3_1_package_a_local_model_without_embeddings_link_in_metadata(self): - print("\n\33[33m{}\33[0m".format("3.1. Package a local model without an embeddings file ...")) - - # create metadata file - with open(os.path.join(self.model_path,"metadata.json"),"w",encoding="utf-8") as f: - f.write("{\n") - f.write('"embeddings_file_name": "wiki.dummy.vec",\n') - f.write('"embeddings_remote_link": "",\n') - f.write('"language": "UD_English",\n') - f.write('"language_code": "my_model",\n') - f.write('"model_build_date": "2020-01-01",\n') - f.write('"model_build_source": "UD_English-ParTuT",\n') - f.write('"model_version": 1.0,\n') - f.write('"notes": "Source: ud-treebanks-v2.2, dummy model",\n') - f.write('"token_delimiter": " "\n') - f.write("}\n") - - #python3 /work/NLP-Cube/scripts/export_model.py /work/my_model-1.0 --tokenizer --tagger - command = "python3 " + os.path.join(self.scripts_path, "export_model.py") + " " + self.model_path - command+= " --tokenizer --tagger --parser --lemmatizer" - print("\n\t\t\33[32m{}\n{}\33[0m".format("Export command:",command)) - ''' popen = subprocess.Popen(command.split(" "), stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - ''' - os.system(command) - - test = os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip")) - self.assertTrue(test) - - def test_3_2_import_model_in_store(self): - print("\n\33[33m{}\33[0m".format("3.2. Import locally created model in store ...")) - command = "python3 " + os.path.join(self.scripts_path, "import_model.py") + " " + os.path.join(self.local_model_repo,"my_model-1.0.zip") - print("\n\t\t\33[32m{}\n{}\33[0m".format("Import command:",command)) - '''popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - ''' - os.system(command) - - # check it is in store - from cube.io_utils.model_store import ModelMetadata, ModelStore - model_store_object = ModelStore() - local_models = model_store_object.list_local_models() - test = False - for model, version in local_models: - if model == "my_model": - test = True - self.assertTrue(test) - - - def test_3_3_run_model_with_manual_embeddings(self): - print("\n\33[33m{}\33[0m".format("3.3. 
Run a local model with manual embeddings ...")) - embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec") - print("\t\tPath to local manual embeddings file: "+embeddings) - from cube.api import Cube - cube = Cube(verbose=True) - cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_embeddings_file=embeddings) - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - sentences = cube(text) - self.assertTrue(len(sentences)>0) - self.assertTrue(len(sentences[0])>0) - - def test_4_1_package_a_local_model_with_embeddings_link_in_metadata(self): - print("\n\33[33m{}\33[0m".format("4.1. Package a local model with an external embeddings file link...")) - - # create metadata file - with open(os.path.join(self.model_path,"metadata.json"),"w",encoding="utf-8") as f: - f.write("{\n") - f.write('"embeddings_file_name": "wiki.got.vec",\n') - f.write('"embeddings_remote_link": "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.vec",\n') - f.write('"language": "UD_English",\n') - f.write('"language_code": "my_model",\n') - f.write('"model_build_date": "2020-01-01",\n') - f.write('"model_build_source": "UD_English-ParTuT",\n') - f.write('"model_version": 1.0,\n') - f.write('"notes": "Source: ud-treebanks-v2.2, dummy model, -got- embeddings because they are small",\n') - f.write('"token_delimiter": " "\n') - f.write("}\n") - - # first cleanup if my_model-1.0.zip already exists - if os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip")): - os.remove(os.path.join(self.local_model_repo,"my_model-1.0.zip")) - - command = "python3 " + os.path.join(self.scripts_path, "export_model.py") + " " + self.model_path - command+= " --tokenizer --tagger --parser --lemmatizer" - print("\n\t\t\33[32m{}\n{}\33[0m".format("Export command:",command)) - '''popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - ''' - os.system(command) - test = os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip")) - self.assertTrue(test) - - def test_4_2_import_model_in_store(self): - print("\n\33[33m{}\33[0m".format("4.2. 
Import locally created model in store (with prior cleanup)...")) - - # first check local models - from cube.io_utils.model_store import ModelMetadata, ModelStore - model_store_object = ModelStore() - local_models = model_store_object.list_local_models() - print("\tFound local models:"+str(local_models)) - self.assertTrue(len(local_models)>0) - - # search for my_model - for model, version in local_models: - if model == "my_model": - # delete local model - print("\tDeleting 'my_model-1.0'...") - model_store_object.delete_model("my_model","1.0") - local_models_new = model_store_object.list_local_models() - print("\tFound local models:"+str(local_models_new)) - self.assertTrue(len(local_models)>len(local_models_new)) - - # import new model - command = "python3 " + os.path.join(self.scripts_path, "import_model.py") + " " + os.path.join(self.local_model_repo,"my_model-1.0.zip") - print("\n\t\t\33[32m{}\n{}\33[0m".format("Import command:",command)) - '''popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - ''' - os.system(command) - - test = os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip")) - self.assertTrue(test) - - # check it is in store - local_models = model_store_object.list_local_models() - test = False - for model, version in local_models: - if model == "my_model": - test = True - self.assertTrue(test) - - def test_4_3_run_model_with_default_external_embeddings(self): - print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ...")) - from cube.api import Cube - cube = Cube(verbose=True) - cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True) - text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln." - sentences = cube(text) - self.assertTrue(len(sentences)>0) - self.assertTrue(len(sentences[0])>0) - - def test_5_cleanup(self): - print("\n\33[33m{}\33[0m".format("5. Cleanup after myself ...")) - - # delete my_model from the store, if it exists - from cube.io_utils.model_store import ModelMetadata, ModelStore - model_store_object = ModelStore() - local_models = model_store_object.list_local_models() - print("\tFound local models:"+str(local_models)) - self.assertTrue(len(local_models)>0) - - for model, version in local_models: - if model == "my_model": - # delete local model - print("\tDeleting 'my_model-1.0'...") - model_store_object.delete_model("my_model","1.0") - local_models_new = model_store_object.list_local_models() - print("\tFound local models:"+str(local_models_new)) - self.assertTrue(len(local_models)>len(local_models_new)) - break - - # delete my_model.zip, if it exists - if os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip")): - os.remove(os.path.join(self.local_model_repo,"my_model-1.0.zip")) - self.assertFalse(os.path.exists(os.path.join(self.local_model_repo,"my_model-1.0.zip"))) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/main_tests.py b/tests/main_tests.py deleted file mode 100644 index 021338775..000000000 --- a/tests/main_tests.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -This class should test: - -1. 
Train a very small model with tokenizer -> parser with several options (train) -2. Test the model using the main functions (run) - -""" -import os, sys, subprocess -import unittest - -class Main_Tests(unittest.TestCase): - - def setUp(self): - # get current directory - self.root_path = os.path.dirname(os.path.realpath(__file__)) - self.root_path = os.path.abspath(os.path.join(self.root_path, os.pardir)) - self.main_file_path = os.path.join(self.root_path, "cube", "main.py") - self.corpus_path = os.path.join(self.root_path, "tests", "test_corpus") - self.model_path = os.path.join(self.root_path, "tests", "my_model-1.0") - self.scratch_path = os.path.join(self.root_path, "tests", "scratch") - self.input_file_path = os.path.join(self.corpus_path, "en_ewt-ud-test.txt") - self.output_file_path = os.path.join(self.scratch_path, "en_ewt-ud-test-output.conllu") - - if not os.path.exists(self.model_path): - os.makedirs(self.model_path) - if not os.path.exists(self.scratch_path): - os.makedirs(self.scratch_path) - - #print("\n\n"+"_"*72) - #print("[setUp] Absolute path of NLP-Cube: "+self.root_path) - #print() - - def test_1_tokenizer_training(self): - command = "python3 " + self.main_file_path + " --train tokenizer" - command+= " --train-file "+os.path.join(self.corpus_path,"en_ewt-ud-train.conllu") + " --raw-train-file " + os.path.join(self.corpus_path,"en_ewt-ud-train.txt") - command+= " --dev-file "+os.path.join(self.corpus_path,"en_ewt-ud-dev.conllu") + " --raw-dev-file " + os.path.join(self.corpus_path,"en_ewt-ud-dev.txt") - command+= " --embeddings "+os.path.join(self.root_path, "examples", "wiki.dummy.vec") - command+= " --store " + os.path.join(self.model_path, "tokenizer") - command+= " --autobatch --batch-size 1000 --set-mem 1000 --random-seed 42 --patience 1" - print("\n\33[33m{}\n{}\33[0m".format("Tokenizer command:",command)) - popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - test = "Training is done with " in output[-1] - self.assertTrue(test) - - def test_2_tagger_training(self): - command = "python3 " + self.main_file_path + " --train tagger" - command+= " --train-file "+os.path.join(self.corpus_path,"en_ewt-ud-train.conllu") - command+= " --dev-file "+os.path.join(self.corpus_path,"en_ewt-ud-dev.conllu") - command+= " --embeddings "+os.path.join(self.root_path, "examples", "wiki.dummy.vec") - command+= " --store " + os.path.join(self.model_path, "tagger") - command+= " --batch-size 500 --set-mem 1000 --patience 1" - print("\n\33[33m{}\n{}\33[0m".format("Tagger command:",command)) - popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - test = "Training is done with " in output[-1] - self.assertTrue(test) - - - - def test_3_lemmatizer_training(self): - command = "python3 " + self.main_file_path + " --train lemmatizer" - command+= " --train-file "+os.path.join(self.corpus_path,"en_ewt-ud-train.conllu") - command+= " --dev-file "+os.path.join(self.corpus_path,"en_ewt-ud-dev.conllu") - command+= " --embeddings "+os.path.join(self.root_path, "examples", "wiki.dummy.vec") - command+= " 
--store " + os.path.join(self.model_path, "lemmatizer") - command+= " --batch-size 750 --patience 1" - print("\n\33[33m{}\n{}\33[0m".format("Lemmatizer command:",command)) - popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - test = "Training is done with " in output[-1] - self.assertTrue(test) - - def test_4_parser_training(self): - command = "python3 " + self.main_file_path + " --train parser" - command+= " --train-file "+os.path.join(self.corpus_path,"en_ewt-ud-train.conllu") - command+= " --dev-file "+os.path.join(self.corpus_path,"en_ewt-ud-dev.conllu") - command+= " --embeddings "+os.path.join(self.root_path, "examples", "wiki.dummy.vec") - command+= " --store " + os.path.join(self.model_path, "parser") - command+= " --batch-size 1000 --set-mem 950 --patience 1" - print("\n\33[33m{}\n{}\33[0m".format("Parser command:",command)) - popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - output = [] - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - if stdout_line.strip()!= "": - output.append(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - test = "Training is done with " in output[-7] - self.assertTrue(test) - - def test_5_run_model(self): - command = "python3 " + self.main_file_path + " --run tokenizer,parser,tagger,lemmatizer" - command+= " --models " + self.model_path - command+= " --embeddings " + os.path.join(self.root_path, "examples", "wiki.dummy.vec") - command+= " --input-file " + self.input_file_path - command+= " --output-file " + self.output_file_path - print("\n\33[33m{}\n{}\33[0m".format("Model run command:",command)) - popen = subprocess.Popen(command.split(" ") , stdout=subprocess.PIPE, universal_newlines=True) - for stdout_line in iter(popen.stdout.readline, ""): - print(stdout_line[:-1]) - popen.stdout.close() - return_code = popen.wait() - self.assertTrue(return_code == 0) - - lines = [] - with open(self.output_file_path,"r",encoding="utf8") as f: - lines = [line for line in f.readlines() if line.strip() != ""] - test = "treaty" in lines[-2] - self.assertTrue(test) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/scratch/en_ewt-ud-test-output.conllu b/tests/scratch/en_ewt-ud-test-output.conllu deleted file mode 100644 index e160ced9f..000000000 --- a/tests/scratch/en_ewt-ud-test-output.conllu +++ /dev/null @@ -1,512 +0,0 @@ -1 What wwwwwwww NOUN NNP _ 13 det _ _ -2 if iiii NOUN IN _ 13 det _ _ -3 Google gggggg NOUN NNP _ 13 det _ _ -4 Morphed mmmmm NOUN NNP _ 13 det _ _ -5 Into iiiiii NOUN NNP _ 13 det _ _ -6 GoogleOS? 
[The remaining ~500 deleted lines of tests/scratch/en_ewt-ud-test-output.conllu are omitted: stale CoNLL-U output from the dummy test model, in which every token is tagged NOUN with deprel "det" and the lemma column holds degenerate runs of repeated characters.]
_ 10 det _ _ - -1 I'm iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii NOUN NNP _ 9 det _ _ -2 not nnnn NOUN RB _ 9 det _ _ -3 fond ffff NOUN RB _ 9 det _ _ -4 of oooo NOUN IN _ 9 det _ _ -5 the tttt NOUN DT _ 9 det _ _ -6 Google- gggggg NOUN NNP _ 9 det _ SpaceAfter=No -7 hates- hhhhh NOUN NNP _ 9 det _ SpaceAfter=No -8 privacy pppp NOUN JJ _ 9 det _ _ -9 argument aaaaa NOUN NN _ 0 det _ _ -10 ( ((((((( NOUN , _ 9 det _ SpaceAfter=No -11 You yyyyy NOUN RB _ 9 det _ _ -12 don't dddd NOUN RB _ 9 det _ _ -13 need nnn NOUN RB _ 9 det _ _ -14 to ttttt NOUN TO _ 9 det _ _ -15 use uuuu NOUN VB _ 9 det _ _ -16 their ttt NOUN VB _ 9 det _ _ -17 site, ssss NOUN , _ 9 det _ _ -18 you yyyy NOUN IN _ 9 det _ _ -19 can ccccc NOUN JJ _ 9 det _ _ -20 opt- ooo NOUN HYPH _ 9 det _ SpaceAfter=No -21 out oooo NOUN RB _ 9 det _ _ -22 of oooo NOUN IN _ 9 det _ _ -23 sharing sss NOUN RB _ 9 det _ _ -24 your yyyy NOUN RB _ 9 det _ _ -25 information iii NOUN RB _ 9 det _ SpaceAfter=No -26 , ,,,,,,,, NOUN , _ 9 det _ _ -27 you yyyy NOUN RB _ 9 det _ _ -28 don't dddd NOUN RB _ 9 det _ _ -29 need nnn NOUN RB _ 9 det _ _ -30 to ttttt NOUN TO _ 9 det _ _ -31 send ssss NOUN RB _ 9 det _ _ -32 stuff ssss NOUN IN _ 9 det _ _ -33 to ttttt NOUN TO _ 9 det _ _ -34 anyone aaa NOUN RB _ 9 det _ _ -35 with www NOUN IN _ 9 det _ _ -36 a aaaaaa NOUN DT _ 9 det _ _ -37 Gmail gggggg NOUN NNP _ 9 det _ _ -38 account aaaaa NOUN NNP _ 9 det _ SpaceAfter=No -39 , ,,,,,,,, NOUN , _ 9 det _ _ -40 and aaa NOUN CC _ 9 det _ _ -41 if iiii NOUN IN _ 9 det _ _ -42 - ------ NOUN HYPH _ 9 det _ SpaceAfter=No -43 - ------ NOUN HYPH _ 9 det _ _ -44 wonder www NOUN NNS _ 9 det _ _ -45 of oooo NOUN IN _ 9 det _ _ -46 wonders www NOUN NNS _ 9 det _ _ -47 - ------- NOUN , _ 9 det _ SpaceAfter=No -48 - ------- NOUN , _ 9 det _ _ -49 you're yyyy NOUN RB _ 9 det _ _ -50 worried ww NOUN NNS _ 9 det _ _ -51 that ttt NOUN IN _ 9 det _ _ -52 you yyyy NOUN PRP _ 9 det _ _ -53 might mmm NOUN RB _ 9 det _ _ -54 send ssss NOUN RB _ 9 det _ _ -55 something sss NOUN NNS _ 9 det _ _ -56 to ttttt NOUN TO _ 9 det _ _ -57 someone sss NOUN RB _ 9 det _ _ -58 who wwww NOUN VB _ 9 det _ _ -59 would wwww NOUN RB _ 9 det _ _ -60 forward ffff NOUN RB _ 9 det _ _ -61 an aaaa NOUN IN _ 9 det _ _ -62 excerpt eeeeee NOUN NN _ 9 det _ _ -63 to ttttt NOUN TO _ 9 det _ _ -64 someone sss NOUN RB _ 9 det _ _ -65 who wwww NOUN VB _ 9 det _ _ -66 would wwww NOUN VB _ 9 det _ _ -67 then ttt NOUN VB _ 9 det _ _ -68 store sss NOUN RB _ 9 det _ _ -69 it iiii NOUN RB _ 9 det _ _ -70 on oooo NOUN IN _ 9 det _ _ -71 a aaaaaa NOUN DT _ 9 det _ _ -72 Gmail gggggg NOUN NNP _ 9 det _ _ -73 account aaaaaa NOUN NN _ 9 det _ SpaceAfter=No -74 . 
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 9 det _ SpaceAfter=No - -1 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 0 det _ SpaceAfter=No - -1 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 0 det _ _ - -1 you yyyy NOUN RB _ 5 det _ _ -2 have hh NOUN VBD _ 5 det _ _ -3 far fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff NOUN NN _ 5 det _ SpaceAfter=No -4 , ,,,,,,,, NOUN , _ 5 det _ _ -5 far fffffff NOUN NNP _ 0 det _ _ -6 too tttt NOUN , _ 5 det _ _ -7 much mmmm NOUN RB _ 5 det _ _ -8 time ttt NOUN NNS _ 5 det _ _ -9 on oooo NOUN IN _ 5 det _ _ -10 your yyyy NOUN RB _ 5 det _ _ -11 hands hhh NOUN RB _ 5 det _ SpaceAfter=No -12 ) ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))) NOUN . _ 5 det _ SpaceAfter=No -13 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . 
_ 5 det _ _ - -1 However hhhhh NOUN NNP _ 3 det _ SpaceAfter=No -2 , ,,,,,,,, NOUN , _ 3 det _ _ -3 this tttt NOUN VBZ _ 0 det _ _ -4 toolbar tttttt NOUN NN _ 3 det _ _ -5 is iiii NOUN VBZ _ 3 det _ _ -6 really rrrr NOUN JJ _ 3 det _ _ -7 bad bbbbbb NOUN JJ _ 3 det _ _ -8 news nnn NOUN NNS _ 3 det _ SpaceAfter=No -9 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 3 det _ _ - -1 On ooooo NOUN IN _ 4 det _ _ -2 the tttt NOUN DT _ 4 det _ _ -3 other ooooo NOUN NNP _ 4 det _ _ -4 hand hhhhh NOUN NNP _ 0 det _ SpaceAfter=No -5 , ,,,,,,,, NOUN , _ 4 det _ _ -6 it iiii NOUN PRP _ 4 det _ _ -7 looks lllll NOUN JJ _ 4 det _ _ -8 pretty pppp NOUN JJ _ 4 det _ _ -9 cool ccc NOUN NNS _ 4 det _ SpaceAfter=No -10 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 4 det _ _ - -1 Iran iiiiiii NOUN NN _ 5 det _ _ -2 says ssssssss NOUN NN _ 5 det _ _ -3 it iiii NOUN IN _ 5 det _ _ -4 is iiii NOUN VBZ _ 5 det _ _ -5 creating cccc NOUN JJ _ 0 det _ _ -6 nuclear nnnnn NOUN JJ _ 5 det _ _ -7 energy eeeee NOUN NN _ 5 det _ _ -8 without www NOUN IN _ 5 det _ _ -9 wanting www NOUN VB _ 5 det _ _ -10 nuclear nnnnn NOUN JJ _ 5 det _ _ -11 weapons www NOUN NNS _ 5 det _ SpaceAfter=No -12 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 5 det _ _ - -1 The ttttt NOUN DT _ 3 det _ _ -2 United uuuuuu NOUN NNP _ 3 det _ _ -3 States ssssss NOUN NNP _ 0 det _ _ -4 doesn't dddddd NOUN NN _ 3 det _ _ -5 believe bbb NOUN IN _ 3 det _ _ -6 the tttt NOUN DT _ 3 det _ _ -7 Iranian iiii NOUN JJ _ 3 det _ _ -8 Government gggggg NOUN NN _ 3 det _ SpaceAfter=No -9 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . 
_ 3 det _ _ - -1 One oooo NOUN CD _ 3 det _ _ -2 can cccccccccnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn NOUN NN _ 3 det _ _ -3 suspect sss NOUN IN _ 0 det _ _ -4 the tttt NOUN DT _ 3 det _ _ -5 Iranian iiiii NOUN NNP _ 3 det _ _ -6 Government gggggg NOUN NN _ 3 det _ SpaceAfter=No -7 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 3 det _ _ - -1 But bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb NOUN NNP _ 2 det _ _ -2 there tt NOUN VBD _ 0 det _ _ -3 is iiii NOUN VBZ _ 2 det _ _ -4 no nnnn NOUN RB _ 2 det _ _ -5 proof ppp NOUN RB _ 2 det _ _ -6 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . 
_ 2 det _ _ - -1 I iiiiiiii NOUN DT _ 7 det _ _ -2 read rrrrrr NOUN NN _ 7 det _ _ -3 an aaaa NOUN IN _ 7 det _ _ -4 Article aaaaa NOUN NNP _ 7 det _ _ -5 in iiii NOUN IN _ 7 det _ _ -6 Time tttt NOUN DT _ 7 det _ _ -7 magazine mmmmm NOUN NN _ 0 det _ _ -8 accusing aaa NOUN IN _ 7 det _ _ -9 the tttt NOUN DT _ 7 det _ _ -10 Iranian iiii NOUN JJ _ 7 det _ _ -11 Government gggggg NOUN NN _ 7 det _ _ -12 of oooo NOUN IN _ 7 det _ _ -13 being bbb NOUN RB _ 7 det _ _ -14 willing ww NOUN NNS _ 7 det _ _ -15 to ttttt NOUN TO _ 7 det _ _ -16 start ssss NOUN RB _ 7 det _ _ -17 a aaaaaa NOUN DT _ 7 det _ _ -18 nuclear nnnnn NOUN NNP _ 7 det _ _ -19 war wwwwwwwwwwrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr NOUN NN _ 8 det _ _ -20 and aaa NOUN CC _ 8 det _ _ -21 I iiiiiiii NOUN DT _ 8 det _ _ -22 sympathise ssssss NOUN NN _ 8 det _ _ -23 with www NOUN IN _ 8 det _ _ -24 the tttt NOUN DT _ 8 det _ _ -25 Article aaaaa NOUN NNP _ 8 det _ SpaceAfter=No -26 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . _ 8 det _ _ - -1 They tttt NOUN PRP _ 7 det _ _ -2 are aa NOUN VBD _ 7 det _ _ -3 certainly cccc NOUN JJ _ 7 det _ _ -4 being bbbbbb NOUN VBN _ 7 det _ _ -5 nasty nnnnnnn NOUN NN _ 7 det _ _ -6 to tttt NOUN IN _ 7 det _ _ -7 the tttt NOUN DT _ 0 det _ _ -8 United uuuuuu NOUN NNP _ 7 det _ _ -9 Nations nnnnnn NOUN NNP _ 7 det _ _ -10 Security ssssss NOUN NNP _ 7 det _ _ -11 Council cccccc NOUN NNP _ 7 det _ _ -12 in iiii NOUN IN _ 7 det _ _ -13 connection ccccc NOUN NN _ 7 det _ _ -14 with www NOUN IN _ 7 det _ _ -15 the tttt NOUN DT _ 7 det _ _ -16 anti- aaaaaa NOUN NNP _ 7 det _ SpaceAfter=No -17 proliferation pppp NOUN JJ _ 7 det _ _ -18 treaty tttttt NOUN NN _ 7 det _ SpaceAfter=No -19 . ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... NOUN . 
diff --git a/tests/test_corpus/LICENSE.txt b/tests/test_corpus/LICENSE.txt
deleted file mode 100644
index 547fb999d..000000000
--- a/tests/test_corpus/LICENSE.txt
+++ /dev/null
@@ -1,426 +0,0 @@
[426 deleted lines omitted: the verbatim Creative Commons
Attribution-ShareAlike 4.0 International license text]
diff --git a/tests/test_corpus/README.md b/tests/test_corpus/README.md
deleted file mode 100644
index cc5f09aed..000000000
--- a/tests/test_corpus/README.md
+++ /dev/null
@@ -1,209 +0,0 @@
-**Note: This is a subset of the full UD_English-EWT corpus, fit for testing purposes. Below is the original readme:**
-
-
-Universal Dependencies - English Dependency Treebank
-Universal Dependencies English Web Treebank v2.2 -- 2018-04-15
-https://github.com/UniversalDependencies/UD_English-EWT
-
-# Summary
-
-A Gold Standard Universal Dependencies Corpus for English,
-built over the source material of the English Web Treebank
-LDC2012T13 (https://catalog.ldc.upenn.edu/LDC2012T13).
-
-# Introduction
-
-The corpus comprises 254,830 words and 16,622 sentences, taken from five
-genres of web media: weblogs, newsgroups, emails, reviews, and Yahoo!
-answers. See the LDC2012T13 documentation for more details on the sources
-of the sentences. The trees were automatically converted into Stanford
-Dependencies and then hand-corrected to Universal Dependencies. All the
-basic dependency annotations have been single-annotated, a limited portion
-of them have been double-annotated, and subsequent correction has been done
-to improve consistency. Other aspects of the treebank, such as universal
-POS tags, features, and enhanced dependencies, have mainly been produced
-automatically, with very limited hand-correction.
-
-# License/Copyright
-
-Universal Dependencies English Web Treebank © 2013-2018
-by The Board of Trustees of The Leland Stanford Junior University.
-All Rights Reserved.
-
-The annotations and database rights of the Universal Dependencies
-English Web Treebank are licensed under a
-Creative Commons Attribution-ShareAlike 4.0 International License.
-
-You should have received a copy of the license along with this
-work. If not, see <https://creativecommons.org/licenses/by-sa/4.0/>.
-
-The underlying texts come from various sources collected for the
-LDC English Web Treebank. Some parts are in the public domain.
-Portions may be © 2012 Google Inc., © 2011 Yahoo! Inc.,
-© 2012 Trustees of the University of Pennsylvania and/or
-© other original authors.
-
-
-# Structure
-
-This directory contains a corpus of sentences annotated using Universal
-Dependencies annotation. The corpus comprises 254,830 words and 16,622
-sentences, taken from various web media including weblogs, newsgroups,
-emails, reviews, and Yahoo! answers; see the LDC2012T13 documentation for
-more details on the source of the sentences. The trees were automatically
-converted into Stanford Dependencies and then hand-corrected to Universal
-Dependencies. All the dependency annotations have been single-annotated,
-and a limited portion of them have been double-annotated, with
-inter-annotator agreement at approximately 96%. The sentence IDs include
-the genre and the filename of the original LDC2012T13 file.
-
-This corpus is compatible with the CoNLL-U format defined for Universal
-Dependencies. See:
-
-   http://universaldependencies.github.io/docs/format.html
-
-The dependency taxonomy can be found on the Universal Dependencies web site:
-
-   http://www.universaldependencies.org
-
-For the conversion to v2, we performed an automatic conversion with
-extensive spot-checking, and manual adjudication of ambiguous cases.
-
-The enhanced dependencies were automatically obtained by running an adapted
-version of the converter by Schuster and Manning (2016). These dependencies
-have **not** been manually checked.
-
-# Deviations from UD
-
-Version 2.2 of the English UD treebank conforms to the UD guidelines in
-almost all respects, but there remain the following deviations:
-
- - The UD dependency `flat` is largely only used for person names.
- - Dates are not annotated consistently.
- - The attachment of punctuation tokens is sometimes not according to the
-   guidelines.
-
-# Changelog
-
-**2018-04-15 v2.2**
- - Repository renamed from UD_English to UD_English-EWT
- - Automatically added enhanced dependencies (These have not been manually
-   checked!)
- - Fixed some wrong lemmata and POS tags
- - Fixed miscellaneous syntactic issues
-
-**2017-11-15 v2.1**
-
- - Fixed some wrong lemmata and POS tags
- - Fixed miscellaneous syntactic issues
- - Added basic dependencies into the `DEPS` column according to the
-   CoNLL-U v2 format
-
-**2017-02-15 v2.0**
-
- - Updated treebank to conform to v2 guidelines
- - Fixed some wrong lemmata
- - Fixed miscellaneous syntactic issues
- - Added empty nodes for gapped constructions in the enhanced representation
-
-**2016-11-15 v1.4**
-
- - Changed POS tag of fused det-noun pronouns (e.g., *"somebody"*,
-   *"nothing"*) to `PRON`
- - Added original, untokenized sentences to CoNLL-U files
- - Fixed some POS errors, features, and wrong lemmata
- - Fixed miscellaneous syntactic issues in a few sentences
-
-**2016-05-15 v1.3**
-
- - Improved mapping of `WDT` to UPOS
- - Corrected lemma of *"n't"* to *"not"*
- - Fixed some errors between `advcl`, `ccomp` and `parataxis`
- - Fixed inconsistent analyses of sentences repeated between dev and train
-   sets
- - Fixed miscellaneous syntactic issues in a few sentences
-
-**2015-11-15 v1.2**
-
- - Bugfix: removed *_NFP* suffix from some lemmas
- - Fixed date annotations to adopt the UD standard
- - Removed escaping of *(* and *)* from word tokens (XPOSTAGs are still
-   `-LRB-` and `-RRB-`)
- - Improved precision of the `xcomp` relation
- - Improved recall of the `name` relation
- - Corrected lemmas for reduced auxiliaries
- - Corrected UPOS tags of pronominal uses of *this/that/these/those* (from
-   `DET` to `PRON`)
- - Corrected UPOS tags of subordinating conjunctions (from `ADP` to `SCONJ`)
- - Corrected UPOS tags of some main verbs (from `AUX` to `VERB`)
-
-# Contributing
-
-To help improve the corpus, please alert us to any errors you find in it.
-The best way to do this is to file a github issue at:
-
-   https://github.com/UniversalDependencies/UD_English-EWT/issues
-
-We also welcome pull requests. If you want to make edits, please modify the
-trees in the individual files in the `not-to-release/sources` directory
-instead of making direct changes to `en-ud-{dev,test,train}.conllu`.
-
-# Acknowledgments
-
-Annotation of the Universal Dependencies English Web Treebank was carried
-out by (in order of size of contribution):
-
- - Natalia Silveira
- - Timothy Dozat
- - Sebastian Schuster
- - Miriam Connor
- - Marie-Catherine de Marneffe
- - Nathan Schneider
- - Samuel Bowman
- - Hanzhi Zhu
- - Daniel Galbraith
- - Christopher Manning
- - John Bauer
-
-Creation of the CoNLL-U files, including calculating UPOS, feature, and
-lemma information, was primarily done by
-
- - Sebastian Schuster
- - Natalia Silveira
-
-The construction of the Universal Dependencies English Web Treebank was
-partially funded by a gift from Google, Inc., which we gratefully
-acknowledge.
-
-# Citations
-
-You are encouraged to cite this paper if you use the Universal Dependencies
-English Web Treebank:
-
-  @inproceedings{silveira14gold,
-    year = {2014},
-    author = {Natalia Silveira and Timothy Dozat and Marie-Catherine de
-              Marneffe and Samuel Bowman and Miriam Connor and John Bauer and
-              Christopher D. Manning},
-    title = {A Gold Standard Dependency Corpus for {E}nglish},
-    booktitle = {Proceedings of the Ninth International Conference on
-                 Language Resources and Evaluation (LREC-2014)}
-  }
-
-# Metadata
-
-```
-=== Machine-readable metadata (DO NOT REMOVE!) ================================
-Data available since: UD v1.0
-License: CC BY-SA 4.0
-Includes text: yes
-Genre: blog social reviews email
-Lemmas: automatic with corrections
-UPOS: converted with corrections
-XPOS: manual native
-Features: automatic
-Relations: manual native
-Contributors: Silveira, Natalia; Dozat, Timothy; Manning, Christopher; Schuster, Sebastian; Bauer, John; Connor, Miriam; de Marneffe, Marie-Catherine; Schneider, Nathan; Bowman, Sam; Zhu, Hanzhi; Galbraith, Daniel
-Contributing: here source
-Contact: syntacticdependencies@lists.stanford.edu
-===============================================================================
-```
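The readme above points to the CoNLL-U spec for the file layout but never shows it being consumed. As a minimal reading sketch in plain Python (no third-party dependencies; `read_conllu` and the field names are illustrative helpers, not part of this repository):

```python
from typing import Dict, Iterator, List

# The ten CoNLL-U columns, in spec order.
FIELDS = ["id", "form", "lemma", "upos", "xpos",
          "feats", "head", "deprel", "deps", "misc"]

def read_conllu(path: str) -> Iterator[List[Dict[str, str]]]:
    """Yield one sentence at a time as a list of token dicts."""
    sent: List[Dict[str, str]] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:                 # blank line terminates a sentence
                if sent:
                    yield sent
                    sent = []
            elif line.startswith("#"):   # newdoc / sent_id / text metadata
                continue
            else:
                cols = line.split("\t")
                # Skip multiword-token ranges (e.g. 1-2) and empty nodes (5.1).
                if "-" in cols[0] or "." in cols[0]:
                    continue
                sent.append(dict(zip(FIELDS, cols)))
    if sent:
        yield sent

# e.g. count sentences in the dev split:
# print(sum(1 for _ in read_conllu("en_ewt-ud-dev.conllu")))
```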
diff --git a/tests/test_corpus/en_ewt-ud-dev.conllu b/tests/test_corpus/en_ewt-ud-dev.conllu
deleted file mode 100644
index 8949f8cfd..000000000
--- a/tests/test_corpus/en_ewt-ud-dev.conllu
+++ /dev/null
@@ -1,793 +0,0 @@
-# newdoc id = weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713
-# sent_id = weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-0001
-# text = From the AP comes this story :
-1 From from ADP IN _ 3 case 3:case _
-2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _
-3 AP AP PROPN NNP Number=Sing 4 obl 4:obl:from _
-4 comes come VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
-5 this this DET DT Number=Sing|PronType=Dem 6 det 6:det _
-6 story story NOUN NN Number=Sing 4 nsubj 4:nsubj _
-7 : : PUNCT : _ 4 punct 4:punct _
[the remaining deleted lines of CoNLL-U annotation omitted: further weblog
sentences from the dev split, annotated in the same ten-column format]
-1 In in ADP IN _ 2 case 2:case _ -2 Fallujah Fallujah PROPN NNP Number=Sing 7 obl 7:obl:in SpaceAfter=No -3 , , PUNCT , _ 7 punct 7:punct _ -4 hundreds hundred NOUN NNS Number=Plur 7 nsubj 7:nsubj _ -5 of of ADP IN _ 6 case 6:case _ -6 demonstrators demonstrator NOUN NNS Number=Plur 4 nmod 4:nmod:of _ -7 came come VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -8 out out ADV RB _ 7 advmod 7:advmod _ -9 against against ADP IN _ 11 case 11:case _ -10 US US PROPN NNP Number=Sing 11 compound 11:compound _ -11 troops troops NOUN NN Number=Sing 7 obl 7:obl:against _ -12 when when ADV WRB PronType=Int 15 mark 15:mark _ -13 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 15 nsubj 15:nsubj _ -14 briefly briefly ADV RB _ 15 advmod 15:advmod _ -15 arrested arrest VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 7 advcl 7:advcl:when _ -16 a a DET DT Definite=Ind|PronType=Art 19 det 19:det _ -17 yound yound ADJ JJ Degree=Pos 19 amod 19:amod _ -18 newlywed newlywed ADJ JJ Degree=Pos 19 amod 19:amod _ -19 bride bride NOUN NN Number=Sing 15 obj 15:obj SpaceAfter=No -20 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20040114085100_ENG_20040114_085100-0004 -# text = (I hope that the US army got an enormous amount of information from her relatives, because otherwise this move was a bad, bad tradeoff). -1 ( ( PUNCT -LRB- _ 3 punct 3:punct SpaceAfter=No -2 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 3 nsubj 3:nsubj _ -3 hope hope VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -4 that that SCONJ IN _ 8 mark 8:mark _ -5 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -6 US US PROPN NNP Number=Sing 7 compound 7:compound _ -7 army army NOUN NN Number=Sing 8 nsubj 8:nsubj _ -8 got get VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 3 ccomp 3:ccomp _ -9 an a DET DT Definite=Ind|PronType=Art 11 det 11:det _ -10 enormous enormous ADJ JJ Degree=Pos 11 amod 11:amod _ -11 amount amount NOUN NN Number=Sing 8 obj 8:obj _ -12 of of ADP IN _ 13 case 13:case _ -13 information information NOUN NN Number=Sing 11 nmod 11:nmod:of _ -14 from from ADP IN _ 16 case 16:case _ -15 her she PRON PRP$ Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs 16 nmod:poss 16:nmod:poss _ -16 relatives relative NOUN NNS Number=Plur 8 obl 8:obl:from SpaceAfter=No -17 , , PUNCT , _ 3 punct 3:punct _ -18 because because SCONJ IN _ 27 mark 27:mark _ -19 otherwise otherwise ADV RB _ 27 advmod 27:advmod _ -20 this this DET DT Number=Sing|PronType=Dem 21 det 21:det _ -21 move move NOUN NN Number=Sing 27 nsubj 27:nsubj _ -22 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 27 cop 27:cop _ -23 a a DET DT Definite=Ind|PronType=Art 27 det 27:det _ -24 bad bad ADJ JJ Degree=Pos 27 amod 27:amod SpaceAfter=No -25 , , PUNCT , _ 27 punct 27:punct _ -26 bad bad ADJ JJ Degree=Pos 27 amod 27:amod _ -27 tradeoff tradeoff NOUN NN Number=Sing 3 advcl 3:advcl:because SpaceAfter=No -28 ) ) PUNCT -RRB- _ 3 punct 3:punct SpaceAfter=No -29 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-juancole.com_juancole_20040114085100_ENG_20040114_085100-0005 -# text = The US troops fired into the hostile crowd, killing 4. 
-1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _ -2 US US PROPN NNP Number=Sing 3 compound 3:compound _ -3 troops troops NOUN NNS Number=Plur 4 nsubj 4:nsubj _ -4 fired fire VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -5 into into ADP IN _ 8 case 8:case _ -6 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -7 hostile hostile ADJ JJ Degree=Pos 8 amod 8:amod _ -8 crowd crowd NOUN NN Number=Sing 4 obl 4:obl:into SpaceAfter=No -9 , , PUNCT , _ 4 punct 4:punct _ -10 killing kill VERB VBG VerbForm=Ger 4 advcl 4:advcl _ -11 4 4 NUM CD NumType=Card 10 obj 10:obj SpaceAfter=No -12 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-juancole.com_juancole_20040114085100_ENG_20040114_085100-0006 -# text = It seems clear to me that the manhunt for high Baath officials in the Sunni heartland is being done wrong, or at least in ways that are bad for US standing with local Iraqis. -1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 2 expl 2:expl _ -2 seems seem VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 clear clear ADJ JJ Degree=Pos 2 xcomp 2:xcomp _ -4 to to ADP IN _ 5 case 5:case _ -5 me I PRON PRP Case=Acc|Number=Sing|Person=1|PronType=Prs 2 obl 2:obl:to _ -6 that that SCONJ IN _ 19 mark 19:mark _ -7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -8 manhunt manhunt NOUN NN Number=Sing 19 nsubj:pass 19:nsubj:pass|20:nsubj:xsubj|26:nsubj:xsubj _ -9 for for ADP IN _ 12 case 12:case _ -10 high high ADJ JJ Degree=Pos 12 amod 12:amod _ -11 Baath Baath PROPN NNP Number=Sing 12 compound 12:compound _ -12 officials official NOUN NNS Number=Plur 8 nmod 8:nmod:for _ -13 in in ADP IN _ 16 case 16:case _ -14 the the DET DT Definite=Def|PronType=Art 16 det 16:det _ -15 Sunni sunni ADJ JJ Degree=Pos 16 amod 16:amod _ -16 heartland heartland NOUN NN Number=Sing 8 nmod 8:nmod:in _ -17 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 19 aux 19:aux _ -18 being be AUX VBG VerbForm=Ger 19 aux:pass 19:aux:pass _ -19 done do VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 2 csubj 2:csubj _ -20 wrong wrong ADV RB _ 19 xcomp 19:xcomp SpaceAfter=No -21 , , PUNCT , _ 26 punct 26:punct _ -22 or or CCONJ CC _ 26 cc 26:cc _ -23 at at ADV RB _ 26 advmod 26:advmod _ -24 least least ADV RBS Degree=Sup 23 fixed 23:fixed _ -25 in in ADP IN _ 26 case 26:case _ -26 ways way NOUN NNS Number=Plur 20 conj 19:xcomp|20:conj:or|29:nsubj _ -27 that that PRON WDT PronType=Rel 29 nsubj 26:ref _ -28 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 29 cop 29:cop _ -29 bad bad ADJ JJ Degree=Pos 26 acl:relcl 26:acl:relcl _ -30 for for ADP IN _ 32 case 32:case _ -31 US US PROPN NNP Number=Sing 32 compound 32:compound _ -32 standing standing NOUN NN Number=Sing 29 obl 29:obl:for _ -33 with with ADP IN _ 35 case 35:case _ -34 local local ADJ JJ Degree=Pos 35 amod 35:amod _ -35 Iraqis Iraqis PROPN NNPS Number=Plur 32 nmod 32:nmod:with SpaceAfter=No -36 . . PUNCT . _ 2 punct 2:punct _ - -# newdoc id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500 -# sent_id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500-0001 -# text = Google has finally had an analyst day -- a chance to present the company's story to the (miniscule number of) people who haven't heard it. 
-1 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ -2 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 aux 4:aux _ -3 finally finally ADV RB _ 4 advmod 4:advmod _ -4 had have VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -5 an a DET DT Definite=Ind|PronType=Art 7 det 7:det _ -6 analyst analyst NOUN NN Number=Sing 7 compound 7:compound _ -7 day day NOUN NN Number=Sing 4 obj 4:obj _ -8 -- -- PUNCT : _ 7 punct 7:punct _ -9 a a DET DT Definite=Ind|PronType=Art 10 det 10:det _ -10 chance chance NOUN NN Number=Sing 7 appos 7:appos _ -11 to to PART TO _ 12 mark 12:mark _ -12 present present VERB VB VerbForm=Inf 10 acl 10:acl:to _ -13 the the DET DT Definite=Def|PronType=Art 14 det 14:det _ -14 company company NOUN NN Number=Sing 16 nmod:poss 16:nmod:poss SpaceAfter=No -15 's 's PART POS _ 14 case 14:case _ -16 story story NOUN NN Number=Sing 12 obj 12:obj _ -17 to to ADP IN _ 21 case 21:case _ -18 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ -19 ( ( PUNCT -LRB- _ 21 punct 21:punct SpaceAfter=No -20 miniscule miniscule ADJ JJ Degree=Pos 21 amod 21:amod _ -21 number number NOUN NN Number=Sing 12 obl 12:obl:to _ -22 of of ADP IN _ 24 case 24:case SpaceAfter=No -23 ) ) PUNCT -RRB- _ 24 punct 24:punct _ -24 people people NOUN NNS Number=Plur 21 nmod 21:nmod:of|28:nsubj _ -25 who who PRON WP PronType=Rel 28 nsubj 24:ref _ -26 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 28 aux 28:aux SpaceAfter=No -27 n't not PART RB _ 28 advmod 28:advmod _ -28 heard hear VERB VBN Tense=Past|VerbForm=Part 24 acl:relcl 24:acl:relcl _ -29 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 28 obj 28:obj SpaceAfter=No -30 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500-0002 -# text = Usually, these are just a chance for the suckups to suck up, but this time people are actually concerned about the company's plans. -1 Usually usually ADV RB _ 7 advmod 7:advmod SpaceAfter=No -2 , , PUNCT , _ 7 punct 7:punct _ -3 these these PRON DT Number=Plur|PronType=Dem 7 nsubj 7:nsubj _ -4 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 7 cop 7:cop _ -5 just just ADV RB _ 7 advmod 7:advmod _ -6 a a DET DT Definite=Ind|PronType=Art 7 det 7:det _ -7 chance chance NOUN NN Number=Sing 0 root 0:root _ -8 for for SCONJ IN _ 12 mark 12:mark _ -9 the the DET DT Definite=Def|PronType=Art 10 det 10:det _ -10 suckups suckup NOUN NNS Number=Plur 12 nsubj 12:nsubj _ -11 to to PART TO _ 12 mark 12:mark _ -12 suck suck VERB VB VerbForm=Inf 7 acl 7:acl:to _ -13 up up ADP RP _ 12 compound:prt 12:compound:prt SpaceAfter=No -14 , , PUNCT , _ 21 punct 21:punct _ -15 but but CCONJ CC _ 21 cc 21:cc _ -16 this this DET DT Number=Sing|PronType=Dem 17 det 17:det _ -17 time time NOUN NN Number=Sing 21 obl:tmod 21:obl:tmod _ -18 people people NOUN NNS Number=Plur 21 nsubj 21:nsubj _ -19 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 21 cop 21:cop _ -20 actually actually ADV RB _ 21 advmod 21:advmod _ -21 concerned concerned ADJ JJ Degree=Pos 7 conj 7:conj:but _ -22 about about ADP IN _ 26 case 26:case _ -23 the the DET DT Definite=Def|PronType=Art 24 det 24:det _ -24 company company NOUN NN Number=Sing 26 nmod:poss 26:nmod:poss SpaceAfter=No -25 's 's PART POS _ 24 case 24:case _ -26 plans plan NOUN NNS Number=Plur 21 obl 21:obl:about SpaceAfter=No -27 . . PUNCT . 
_ 7 punct 7:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500-0003 -# text = They work on Wall Street, after all, so when they hear a company who's stated goals include "Don't be evil," they imagine a company who's eventually history will be "Don't be profitable." -1 They they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 2 nsubj 2:nsubj _ -2 work work VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 on on ADP IN _ 5 case 5:case _ -4 Wall Wall PROPN NNP Number=Sing 5 compound 5:compound _ -5 Street Street PROPN NNP Number=Sing 2 obl 2:obl:on SpaceAfter=No -6 , , PUNCT , _ 2 punct 2:punct _ -7 after after ADV IN _ 8 case 8:case _ -8 all all ADV RB _ 2 obl 2:obl:after SpaceAfter=No -9 , , PUNCT , _ 2 punct 2:punct _ -10 so so ADV RB _ 28 advmod 28:advmod _ -11 when when ADV WRB PronType=Int 13 mark 13:mark _ -12 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 13 nsubj 13:nsubj _ -13 hear hear VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 28 advcl 28:advcl:when _ -14 a a DET DT Definite=Ind|PronType=Art 15 det 15:det _ -15 company company NOUN NN Number=Sing 13 obj 13:obj _ -16 who's who's PRON WP$ PronType=Int 18 nmod:poss 18:nmod:poss _ -17 stated state VERB VBN Tense=Past|VerbForm=Part 18 amod 18:amod _ -18 goals goal NOUN NNS Number=Plur 19 nsubj 19:nsubj _ -19 include include VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 15 acl:relcl 15:acl:relcl _ -20 " " PUNCT `` _ 24 punct 24:punct SpaceAfter=No -21 Do do AUX VB Mood=Imp|VerbForm=Fin 24 aux 24:aux SpaceAfter=No -22 n't not PART RB _ 24 advmod 24:advmod _ -23 be be AUX VB Mood=Imp|VerbForm=Fin 24 cop 24:cop _ -24 evil evil ADJ JJ Degree=Pos 19 ccomp 19:ccomp SpaceAfter=No -25 , , PUNCT , _ 13 punct 13:punct SpaceAfter=No -26 " " PUNCT '' _ 13 punct 13:punct _ -27 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 28 nsubj 28:nsubj _ -28 imagine imagine VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 2 parataxis 2:parataxis _ -29 a a DET DT Definite=Ind|PronType=Art 30 det 30:det _ -30 company company NOUN NN Number=Sing 28 obj 28:obj _ -31 who's who's PRON WP$ PronType=Int 33 nmod:poss 33:nmod:poss _ -32 eventually eventually ADJ JJ Degree=Pos 33 amod 33:amod _ -33 history history NOUN NN Number=Sing 35 nsubj 35:nsubj _ -34 will will AUX MD VerbForm=Fin 35 aux 35:aux _ -35 be be VERB VB VerbForm=Inf 30 acl:relcl 30:acl:relcl _ -36 " " PUNCT `` _ 40 punct 40:punct SpaceAfter=No -37 Do do AUX VB VerbForm=Inf 40 aux 40:aux SpaceAfter=No -38 n't not PART RB _ 40 advmod 40:advmod _ -39 be be AUX VB VerbForm=Inf 40 cop 40:cop _ -40 profitable profitable ADJ JJ Degree=Pos 35 ccomp 35:ccomp SpaceAfter=No -41 . . PUNCT . _ 2 punct 2:punct SpaceAfter=No -42 " " PUNCT '' _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500-0004 -# text = It's not quite as freewheeling an environment as you'd imagine: Sergey Brin has actually created a mathematical 'proof' that the company's self-driven research strategy, which gives employees one day a week to do research projects on their own, is a good, respectable idea . 
-1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 8 nsubj 8:nsubj SpaceAfter=No -2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 cop 8:cop _ -3 not not PART RB _ 8 advmod 8:advmod _ -4 quite quite ADV RB _ 5 advmod 5:advmod _ -5 as as ADV RB _ 6 advmod 6:advmod _ -6 freewheeling freewheeling ADJ JJ Degree=Pos 8 amod 8:amod _ -7 an a DET DT Definite=Ind|PronType=Art 8 det 8:det _ -8 environment environment NOUN NN Number=Sing 0 root 0:root _ -9 as as SCONJ IN _ 12 mark 12:mark _ -10 you you PRON PRP Case=Nom|Person=2|PronType=Prs 12 nsubj 12:nsubj SpaceAfter=No -11 'd would AUX MD VerbForm=Fin 12 aux 12:aux _ -12 imagine imagine VERB VB VerbForm=Inf 5 advcl 5:advcl:as SpaceAfter=No -13 : : PUNCT : _ 8 punct 8:punct _ -14 Sergey Sergey PROPN NNP Number=Sing 18 nsubj 18:nsubj _ -15 Brin Brin PROPN NNP Number=Sing 14 flat 14:flat _ -16 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 18 aux 18:aux _ -17 actually actually ADV RB _ 18 advmod 18:advmod _ -18 created create VERB VBN Tense=Past|VerbForm=Part 8 parataxis 8:parataxis _ -19 a a DET DT Definite=Ind|PronType=Art 22 det 22:det _ -20 mathematical mathematical ADJ JJ Degree=Pos 22 amod 22:amod _ -21 ' ' PUNCT `` _ 22 punct 22:punct SpaceAfter=No -22 proof proof NOUN NN Number=Sing 18 obj 18:obj SpaceAfter=No -23 ' ' PUNCT '' _ 22 punct 22:punct _ -24 that that SCONJ IN _ 54 mark 54:mark _ -25 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -26 company company NOUN NN Number=Sing 32 nmod:poss 32:nmod:poss SpaceAfter=No -27 's 's PART POS _ 26 case 26:case _ -28 self self NOUN NN Number=Sing 30 compound 30:compound SpaceAfter=No -29 - - PUNCT HYPH _ 30 punct 30:punct SpaceAfter=No -30 driven drive VERB VBN Tense=Past|VerbForm=Part 32 amod 32:amod _ -31 research research NOUN NN Number=Sing 32 compound 32:compound _ -32 strategy strategy NOUN NN Number=Sing 54 nsubj 35:nsubj|54:nsubj SpaceAfter=No -33 , , PUNCT , _ 32 punct 32:punct _ -34 which which PRON WDT PronType=Rel 35 nsubj 32:ref _ -35 gives give VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 32 acl:relcl 32:acl:relcl _ -36 employees employee NOUN NNS Number=Plur 35 iobj 35:iobj _ -37 one one NUM CD NumType=Card 38 nummod 38:nummod _ -38 day day NOUN NN Number=Sing 35 obj 35:obj _ -39 a a DET DT Definite=Ind|PronType=Art 40 det 40:det _ -40 week week NOUN NN Number=Sing 38 nmod:tmod 38:nmod:tmod _ -41 to to PART TO _ 42 mark 42:mark _ -42 do do VERB VB VerbForm=Inf 38 acl 38:acl:to _ -43 research research NOUN NN Number=Sing 44 compound 44:compound _ -44 projects project NOUN NNS Number=Plur 42 obj 42:obj _ -45 on on ADP IN _ 47 case 47:case _ -46 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 47 nmod:poss 47:nmod:poss _ -47 own own ADJ JJ Degree=Pos 42 obl 42:obl:on SpaceAfter=No -48 , , PUNCT , _ 54 punct 54:punct _ -49 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 54 cop 54:cop _ -50 a a DET DT Definite=Ind|PronType=Art 54 det 54:det _ -51 good good ADJ JJ Degree=Pos 54 amod 54:amod SpaceAfter=No -52 , , PUNCT , _ 54 punct 54:punct _ -53 respectable respectable ADJ JJ Degree=Pos 54 amod 54:amod _ -54 idea idea NOUN NN Number=Sing 22 acl 22:acl:that _ -55 . . PUNCT . _ 8 punct 8:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050210075500_ENG_20050210_075500-0005 -# text = Read the entire article; there's a punchline, too. 
-1 Read read VERB VB Mood=Imp|VerbForm=Fin 0 root 0:root _ -2 the the DET DT Definite=Def|PronType=Art 4 det 4:det _ -3 entire entire ADJ JJ Degree=Pos 4 amod 4:amod _ -4 article article NOUN NN Number=Sing 1 obj 1:obj SpaceAfter=No -5 ; ; PUNCT , _ 1 punct 1:punct _ -6 there there PRON EX _ 7 expl 7:expl SpaceAfter=No -7 's be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 1 parataxis 1:parataxis _ -8 a a DET DT Definite=Ind|PronType=Art 9 det 9:det _ -9 punchline punchline NOUN NN Number=Sing 7 nsubj 7:nsubj SpaceAfter=No -10 , , PUNCT , _ 7 punct 7:punct _ -11 too too ADV RB _ 7 advmod 7:advmod SpaceAfter=No -12 . . PUNCT . _ 1 punct 1:punct _ - -# newdoc id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600 -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0001 -# text = My opinion piece on the implications of Arafat's passing for al-Qaeda has appeared at Newsday. -1 My my PRON PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs 3 nmod:poss 3:nmod:poss _ -2 opinion opinion NOUN NN Number=Sing 3 compound 3:compound _ -3 piece piece NOUN NN Number=Sing 16 nsubj 16:nsubj _ -4 on on ADP IN _ 6 case 6:case _ -5 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -6 implications implication NOUN NNS Number=Plur 3 nmod 3:nmod:on _ -7 of of ADP IN _ 10 case 10:case _ -8 Arafat Arafat PROPN NNP Number=Sing 10 nmod:poss 10:nmod:poss SpaceAfter=No -9 's 's PART POS _ 8 case 8:case _ -10 passing passing NOUN NN Number=Sing 6 nmod 6:nmod:of _ -11 for for ADP IN _ 14 case 14:case _ -12 al al PROPN NNP Number=Sing 14 compound 14:compound SpaceAfter=No -13 - - PUNCT HYPH _ 14 punct 14:punct SpaceAfter=No -14 Qaeda Qaeda PROPN NNP Number=Sing 6 nmod 6:nmod:for _ -15 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 16 aux 16:aux _ -16 appeared appear VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -17 at at ADP IN _ 18 case 18:case _ -18 Newsday Newsday PROPN NNP Number=Sing 16 obl 16:obl:at SpaceAfter=No -19 . . PUNCT . _ 16 punct 16:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0002 -# text = Excerpt: -1 Excerpt excerpt NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -2 : : PUNCT : _ 1 punct 1:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0003 -# text = "Arafat's secular nationalism was supple enough to compromise with Israel and to imagine a two-state solution, even if the road of negotiations remained rocky. 
-1 " " PUNCT `` _ 7 punct 7:punct SpaceAfter=No -2 Arafat Arafat PROPN NNP Number=Sing 5 nmod:poss 5:nmod:poss SpaceAfter=No -3 's 's PART POS _ 2 case 2:case _ -4 secular secular ADJ JJ Degree=Pos 5 amod 5:amod _ -5 nationalism nationalism NOUN NN Number=Sing 7 nsubj 7:nsubj _ -6 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 7 cop 7:cop _ -7 supple supple ADJ JJ Degree=Pos 0 root 0:root _ -8 enough enough ADV RB _ 7 advmod 7:advmod _ -9 to to PART TO _ 10 mark 10:mark _ -10 compromise compromise VERB VB VerbForm=Inf 7 advcl 7:advcl:to _ -11 with with ADP IN _ 12 case 12:case _ -12 Israel Israel PROPN NNP Number=Sing 10 obl 10:obl:with _ -13 and and CCONJ CC _ 15 cc 15:cc _ -14 to to PART TO _ 15 mark 15:mark _ -15 imagine imagine VERB VB VerbForm=Inf 10 conj 7:advcl:to|10:conj:and _ -16 a a DET DT Definite=Ind|PronType=Art 20 det 20:det _ -17 two two NUM CD NumType=Card 19 nummod 19:nummod SpaceAfter=No -18 - - PUNCT HYPH _ 19 punct 19:punct SpaceAfter=No -19 state state NOUN NN Number=Sing 20 compound 20:compound _ -20 solution solution NOUN NN Number=Sing 15 obj 15:obj SpaceAfter=No -21 , , PUNCT , _ 15 punct 15:punct _ -22 even even ADV RB _ 28 advmod 28:advmod _ -23 if if SCONJ IN _ 28 mark 28:mark _ -24 the the DET DT Definite=Def|PronType=Art 25 det 25:det _ -25 road road NOUN NN Number=Sing 28 nsubj 28:nsubj|29:nsubj:xsubj _ -26 of of ADP IN _ 27 case 27:case _ -27 negotiations negotiation NOUN NNS Number=Plur 25 nmod 25:nmod:of _ -28 remained remain VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 15 advcl 15:advcl:if _ -29 rocky rocky ADJ JJ Degree=Pos 28 xcomp 28:xcomp SpaceAfter=No -30 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0004 -# text = The continued Israeli colonization of the occupied Palestinian territories during the 1990s helped, along with terrorist attacks by radical groups such as Hamas, to derail the peace process, which Sharon had always opposed. 
-1 The the DET DT Definite=Def|PronType=Art 4 det 4:det _ -2 continued continue VERB VBN Tense=Past|VerbForm=Part 4 amod 4:amod _ -3 Israeli israeli ADJ JJ Degree=Pos 4 amod 4:amod _ -4 colonization colonization NOUN NN Number=Sing 13 nsubj 13:nsubj|27:nsubj:xsubj _ -5 of of ADP IN _ 9 case 9:case _ -6 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -7 occupied occupy VERB VBN Tense=Past|VerbForm=Part 9 amod 9:amod _ -8 Palestinian Palestinian PROPN NNP Number=Sing 9 compound 9:compound _ -9 territories territories PROPN NNPS Number=Plur 4 nmod 4:nmod:of _ -10 during during ADP IN _ 12 case 12:case _ -11 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -12 1990s 1990 NOUN NNS Number=Plur 4 nmod 4:nmod:during _ -13 helped help VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root SpaceAfter=No -14 , , PUNCT , _ 13 punct 13:punct _ -15 along along ADP IN _ 18 case 18:case _ -16 with with ADP IN _ 18 case 18:case _ -17 terrorist terrorist ADJ JJ Degree=Pos 18 amod 18:amod _ -18 attacks attack NOUN NNS Number=Plur 13 obl 13:obl:along _ -19 by by ADP IN _ 21 case 21:case _ -20 radical radical ADJ JJ Degree=Pos 21 amod 21:amod _ -21 groups group NOUN NNS Number=Plur 18 nmod 18:nmod:by _ -22 such such ADJ JJ Degree=Pos 24 case 24:case _ -23 as as ADP IN _ 22 fixed 22:fixed _ -24 Hamas Hamas PROPN NNP Number=Sing 21 nmod 21:nmod:such_as SpaceAfter=No -25 , , PUNCT , _ 13 punct 13:punct _ -26 to to PART TO _ 27 mark 27:mark _ -27 derail derail VERB VB VerbForm=Inf 13 xcomp 13:xcomp _ -28 the the DET DT Definite=Def|PronType=Art 30 det 30:det _ -29 peace peace NOUN NN Number=Sing 30 compound 30:compound _ -30 process process NOUN NN Number=Sing 27 obj 27:obj|36:obj SpaceAfter=No -31 , , PUNCT , _ 30 punct 30:punct _ -32 which which PRON WDT PronType=Rel 36 obj 30:ref _ -33 Sharon Sharon PROPN NNP Number=Sing 36 nsubj 36:nsubj _ -34 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 36 aux 36:aux _ -35 always always ADV RB _ 36 advmod 36:advmod _ -36 opposed oppose VERB VBN Tense=Past|VerbForm=Part 30 acl:relcl 30:acl:relcl SpaceAfter=No -37 . . PUNCT . _ 13 punct 13:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0005 -# text = Arafat's death creates a vacuum in Palestinian leadership that will not soon be filled. -1 Arafat Arafat PROPN NNP Number=Sing 3 nmod:poss 3:nmod:poss SpaceAfter=No -2 's 's PART POS _ 1 case 1:case _ -3 death death NOUN NN Number=Sing 4 nsubj 4:nsubj _ -4 creates create VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -5 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ -6 vacuum vacuum NOUN NN Number=Sing 4 obj 4:obj|15:nsubj:pass _ -7 in in ADP IN _ 9 case 9:case _ -8 Palestinian palestinian ADJ JJ Degree=Pos 9 amod 9:amod _ -9 leadership leadership NOUN NN Number=Sing 6 nmod 6:nmod:in _ -10 that that PRON WDT PronType=Rel 15 nsubj:pass 6:ref _ -11 will will AUX MD VerbForm=Fin 15 aux 15:aux _ -12 not not PART RB _ 15 advmod 15:advmod _ -13 soon soon ADV RB Degree=Pos 15 advmod 15:advmod _ -14 be be AUX VB VerbForm=Inf 15 aux:pass 15:aux:pass _ -15 filled fill VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 6 acl:relcl 6:acl:relcl SpaceAfter=No -16 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0006 -# text = Sharon's assassination of major Hamas leaders has also weakened authority structures in that party. 
-1 Sharon Sharon PROPN NNP Number=Sing 3 nmod:poss 3:nmod:poss SpaceAfter=No -2 's 's PART POS _ 1 case 1:case _ -3 assassination assassination NOUN NN Number=Sing 10 nsubj 10:nsubj _ -4 of of ADP IN _ 7 case 7:case _ -5 major major ADJ JJ Degree=Pos 7 amod 7:amod _ -6 Hamas Hamas PROPN NNP Number=Sing 7 compound 7:compound _ -7 leaders leader NOUN NNS Number=Plur 3 nmod 3:nmod:of _ -8 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 10 aux 10:aux _ -9 also also ADV RB _ 10 advmod 10:advmod _ -10 weakened weaken VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -11 authority authority NOUN NN Number=Sing 12 compound 12:compound _ -12 structures structure NOUN NNS Number=Plur 10 obj 10:obj _ -13 in in ADP IN _ 15 case 15:case _ -14 that that DET DT Number=Sing|PronType=Dem 15 det 15:det _ -15 party party NOUN NN Number=Sing 12 nmod 12:nmod:in SpaceAfter=No -16 . . PUNCT . _ 10 punct 10:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0007 -# text = If the Israelis and the Palestinian leadership cannot find a way to reinvigorate the peace process, cells of radical young Palestinians may grow up that look to bin Laden for their cues. -1 If if SCONJ IN _ 10 mark 10:mark _ -2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _ -3 Israelis Israelis PROPN NNPS Number=Plur 10 nsubj 10:nsubj _ -4 and and CCONJ CC _ 7 cc 7:cc _ -5 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -6 Palestinian palestinian ADJ JJ Degree=Pos 7 amod 7:amod _ -7 leadership leadership NOUN NN Number=Sing 3 conj 3:conj:and|10:nsubj _ -8 can can AUX MD VerbForm=Fin 10 aux 10:aux SpaceAfter=No -9 not not PART RB _ 10 advmod 10:advmod _ -10 find find VERB VB VerbForm=Inf 25 advcl 25:advcl:if _ -11 a a DET DT Definite=Ind|PronType=Art 12 det 12:det _ -12 way way NOUN NN Number=Sing 10 obj 10:obj _ -13 to to PART TO _ 14 mark 14:mark _ -14 reinvigorate reinvigorate VERB VB VerbForm=Inf 12 acl 12:acl:to _ -15 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -16 peace peace NOUN NN Number=Sing 17 compound 17:compound _ -17 process process NOUN NN Number=Sing 14 obj 14:obj SpaceAfter=No -18 , , PUNCT , _ 25 punct 25:punct _ -19 cells cell NOUN NNS Number=Plur 25 nsubj 25:nsubj _ -20 of of ADP IN _ 23 case 23:case _ -21 radical radical ADJ JJ Degree=Pos 23 amod 23:amod _ -22 young young ADJ JJ Degree=Pos 23 amod 23:amod _ -23 Palestinians Palestinians PROPN NNPS Number=Plur 19 nmod 19:nmod:of|28:nsubj _ -24 may may AUX MD VerbForm=Fin 25 aux 25:aux _ -25 grow grow VERB VB VerbForm=Inf 0 root 0:root _ -26 up up ADP RP _ 25 compound:prt 25:compound:prt _ -27 that that PRON WDT PronType=Rel 28 nsubj 23:ref _ -28 look look VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 23 acl:relcl 23:acl:relcl _ -29 to to ADP IN _ 30 case 30:case _ -30 bin bin PROPN NNP Number=Sing 28 obl 28:obl:to _ -31 Laden Laden PROPN NNP Number=Sing 30 flat 30:flat _ -32 for for ADP IN _ 34 case 34:case _ -33 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 34 nmod:poss 34:nmod:poss _ -34 cues cue NOUN NNS Number=Plur 28 obl 28:obl:for SpaceAfter=No -35 . . PUNCT . _ 25 punct 25:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0008 -# text = Even if local Palestinian leaders remain strong enough to keep al-Qaida out, the festering Israeli-Palestinian struggle remains among the best recruiting posters for al-Qaida with young Muslim men. 
-1 Even even ADV RB _ 6 advmod 6:advmod _ -2 if if SCONJ IN _ 6 mark 6:mark _ -3 local local ADJ JJ Degree=Pos 5 amod 5:amod _ -4 Palestinian palestinian ADJ JJ Degree=Pos 5 amod 5:amod _ -5 leaders leader NOUN NNS Number=Plur 6 nsubj 6:nsubj|8:nsubj:xsubj _ -6 remain remain VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 22 advcl 22:advcl:if _ -7 strong strong ADJ JJ Degree=Pos 8 amod 8:amod _ -8 enough enough ADJ JJ Degree=Pos 6 xcomp 6:xcomp _ -9 to to PART TO _ 10 mark 10:mark _ -10 keep keep VERB VB VerbForm=Inf 7 advcl 7:advcl:to _ -11 al al PROPN NNP Number=Sing 13 compound 13:compound SpaceAfter=No -12 - - PUNCT HYPH _ 13 punct 13:punct SpaceAfter=No -13 Qaida Qaida PROPN NNP Number=Sing 10 obj 10:obj|14:nsubj:xsubj _ -14 out out ADV RB _ 10 xcomp 10:xcomp SpaceAfter=No -15 , , PUNCT , _ 22 punct 22:punct _ -16 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ -17 festering fester VERB VBG VerbForm=Ger 21 amod 21:amod _ -18 Israeli israeli ADJ JJ Degree=Pos 20 amod 20:amod SpaceAfter=No -19 - - PUNCT HYPH _ 20 punct 20:punct SpaceAfter=No -20 Palestinian palestinian ADJ JJ Degree=Pos 21 amod 21:amod _ -21 struggle struggle NOUN NN Number=Sing 22 nsubj 22:nsubj _ -22 remains remain VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -23 among among ADP IN _ 27 case 27:case _ -24 the the DET DT Definite=Def|PronType=Art 27 det 27:det _ -25 best best ADJ JJS Degree=Sup 27 amod 27:amod _ -26 recruiting recruiting NOUN NN Number=Sing 27 compound 27:compound _ -27 posters poster NOUN NNS Number=Plur 22 obl 22:obl:among _ -28 for for ADP IN _ 31 case 31:case _ -29 al al PROPN NNP Number=Sing 31 compound 31:compound SpaceAfter=No -30 - - PUNCT HYPH _ 31 punct 31:punct SpaceAfter=No -31 Qaida Qaida PROPN NNP Number=Sing 27 nmod 27:nmod:for _ -32 with with ADP IN _ 35 case 35:case _ -33 young young ADJ JJ Degree=Pos 35 amod 35:amod _ -34 Muslim muslim ADJ JJ Degree=Pos 35 amod 35:amod _ -35 men man NOUN NNS Number=Plur 22 obl 22:obl:with SpaceAfter=No -36 . . PUNCT . _ 22 punct 22:punct _ - -# sent_id = weblog-juancole.com_juancole_20041120060600_ENG_20041120_060600-0009 -# text = Resolving this conflict would be the most effective weapon the United States could deploy in its war on terror." -1 Resolving resolve VERB VBG VerbForm=Ger 9 csubj 9:csubj _ -2 this this DET DT Number=Sing|PronType=Dem 3 det 3:det _ -3 conflict conflict NOUN NN Number=Sing 1 obj 1:obj _ -4 would would AUX MD VerbForm=Fin 9 aux 9:aux _ -5 be be AUX VB VerbForm=Inf 9 cop 9:cop _ -6 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -7 most most ADV RBS _ 8 advmod 8:advmod _ -8 effective effective ADJ JJ Degree=Pos 9 amod 9:amod _ -9 weapon weapon NOUN NN Number=Sing 0 root 0:root _ -10 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -11 United United PROPN NNP Number=Sing 12 compound 12:compound _ -12 States States PROPN NNP Number=Sing 14 nsubj 14:nsubj _ -13 could could AUX MD VerbForm=Fin 14 aux 14:aux _ -14 deploy deploy VERB VB VerbForm=Inf 9 acl:relcl 9:acl:relcl _ -15 in in ADP IN _ 17 case 17:case _ -16 its its PRON PRP$ Gender=Neut|Number=Sing|Person=3|Poss=Yes|PronType=Prs 17 nmod:poss 17:nmod:poss _ -17 war war NOUN NN Number=Sing 14 obl 14:obl:in _ -18 on on ADP IN _ 19 case 19:case _ -19 terror terror NOUN NN Number=Sing 17 nmod 17:nmod:on SpaceAfter=No -20 . . PUNCT . 
_ 9 punct 9:punct SpaceAfter=No -21 " " PUNCT '' _ 9 punct 9:punct _ - diff --git a/tests/test_corpus/en_ewt-ud-dev.txt b/tests/test_corpus/en_ewt-ud-dev.txt deleted file mode 100644 index 79e9ae8f7..000000000 --- a/tests/test_corpus/en_ewt-ud-dev.txt +++ /dev/null @@ -1,52 +0,0 @@ -From the AP comes this story : President Bush on Tuesday nominated two -individuals to replace retiring jurists on federal courts in the Washington -area. Bush nominated Jennifer M. Anderson for a 15-year term as associate judge -of the Superior Court of the District of Columbia, replacing Steffen W. Graae. -*** Bush also nominated A. Noel Anketell Kramer for a 15-year term as associate -judge of the District of Columbia Court of Appeals, replacing John Montague -Steadman. - -The sheikh in wheel-chair has been attacked with a F-16-launched bomb. He could -be killed years ago and the israelians have all the reasons, since he founded -and he is the spiritual leader of Hamas, but they didn't. Today's incident -proves that Sharon has lost his patience and his hope in peace. Nervous people -make mistakes, so I suppose there will be a wave of succesfull arab attacks. A -la guerre c'est comme a la guerre! - -In the eastern city of Baqubah, guerrillas detonated a car bomb outside a police -station, killing several people. The US lost yet another helicopter to hostile -fire near Habbaniyah in the Sunni heartland, but this time the crew was safe. In -Fallujah, hundreds of demonstrators came out against US troops when they briefly -arrested a yound newlywed bride. (I hope that the US army got an enormous amount -of information from her relatives, because otherwise this move was a bad, bad -tradeoff). The US troops fired into the hostile crowd, killing 4. It seems clear -to me that the manhunt for high Baath officials in the Sunni heartland is being -done wrong, or at least in ways that are bad for US standing with local Iraqis. - -Google has finally had an analyst day -- a chance to present the company's story -to the (miniscule number of) people who haven't heard it. Usually, these are -just a chance for the suckups to suck up, but this time people are actually -concerned about the company's plans. They work on Wall Street, after all, so -when they hear a company who's stated goals include "Don't be evil," they -imagine a company who's eventually history will be "Don't be profitable." It's -not quite as freewheeling an environment as you'd imagine: Sergey Brin has -actually created a mathematical 'proof' that the company's self-driven research -strategy, which gives employees one day a week to do research projects on their -own, is a good, respectable idea . Read the entire article; there's a punchline, -too. - -My opinion piece on the implications of Arafat's passing for al-Qaeda has -appeared at Newsday. Excerpt: "Arafat's secular nationalism was supple enough to -compromise with Israel and to imagine a two-state solution, even if the road of -negotiations remained rocky. The continued Israeli colonization of the occupied -Palestinian territories during the 1990s helped, along with terrorist attacks by -radical groups such as Hamas, to derail the peace process, which Sharon had -always opposed. Arafat's death creates a vacuum in Palestinian leadership that -will not soon be filled. Sharon's assassination of major Hamas leaders has also -weakened authority structures in that party. 
If the Israelis and the Palestinian -leadership cannot find a way to reinvigorate the peace process, cells of radical -young Palestinians may grow up that look to bin Laden for their cues. Even if -local Palestinian leaders remain strong enough to keep al-Qaida out, the -festering Israeli-Palestinian struggle remains among the best recruiting posters -for al-Qaida with young Muslim men. Resolving this conflict would be the most -effective weapon the United States could deploy in its war on terror." diff --git a/tests/test_corpus/en_ewt-ud-test.conllu b/tests/test_corpus/en_ewt-ud-test.conllu deleted file mode 100644 index 4ebb1a522..000000000 --- a/tests/test_corpus/en_ewt-ud-test.conllu +++ /dev/null @@ -1,596 +0,0 @@ -# newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 -# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0001 -# text = What if Google Morphed Into GoogleOS? -1 What what PRON WP PronType=Int 0 root 0:root _ -2 if if SCONJ IN _ 4 mark 4:mark _ -3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ -4 Morphed morph VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ -5 Into into ADP IN _ 6 case 6:case _ -6 GoogleOS GoogleOS PROPN NNP Number=Sing 4 obl 4:obl:into SpaceAfter=No -7 ? ? PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0002 -# text = What if Google expanded on its search-engine (and now e-mail) wares into a full-fledged operating system? -1 What what PRON WP PronType=Int 0 root 0:root _ -2 if if SCONJ IN _ 4 mark 4:mark _ -3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ -4 expanded expand VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ -5 on on ADP IN _ 15 case 15:case _ -6 its its PRON PRP$ Gender=Neut|Number=Sing|Person=3|Poss=Yes|PronType=Prs 15 nmod:poss 15:nmod:poss _ -7 search search NOUN NN Number=Sing 9 compound 9:compound SpaceAfter=No -8 - - PUNCT HYPH _ 9 punct 9:punct SpaceAfter=No -9 engine engine NOUN NN Number=Sing 15 compound 15:compound _ -10 ( ( PUNCT -LRB- _ 9 punct 9:punct SpaceAfter=No -11 and and CCONJ CC _ 13 cc 13:cc _ -12 now now ADV RB _ 13 advmod 13:advmod _ -13 e-mail e-mail NOUN NN Number=Sing 9 conj 9:conj:and|15:compound SpaceAfter=No -14 ) ) PUNCT -RRB- _ 15 punct 15:punct _ -15 wares wares NOUN NNS Number=Plur 4 obl 4:obl:on _ -16 into into ADP IN _ 22 case 22:case _ -17 a a DET DT Definite=Ind|PronType=Art 22 det 22:det _ -18 full full ADV RB _ 20 advmod 20:advmod SpaceAfter=No -19 - - PUNCT HYPH _ 20 punct 20:punct SpaceAfter=No -20 fledged fledged ADJ JJ Degree=Pos 22 amod 22:amod _ -21 operating operating NOUN NN Number=Sing 22 compound 22:compound _ -22 system system NOUN NN Number=Sing 4 obl 4:obl:into SpaceAfter=No -23 ? ? PUNCT . 
_ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0003 -# text = [via Microsoft Watch from Mary Jo Foley ] -1 [ [ PUNCT -LRB- _ 4 punct 4:punct SpaceAfter=No -2 via via ADP IN _ 4 case 4:case _ -3 Microsoft Microsoft PROPN NNP Number=Sing 4 compound 4:compound _ -4 Watch Watch PROPN NNP Number=Sing 0 root 0:root _ -5 from from ADP IN _ 6 case 6:case _ -6 Mary Mary PROPN NNP Number=Sing 4 nmod 4:nmod:from _ -7 Jo Jo PROPN NNP Number=Sing 6 flat 6:flat _ -8 Foley Foley PROPN NNP Number=Sing 6 flat 6:flat _ -9 ] ] PUNCT -RRB- _ 4 punct 4:punct _ - -# newdoc id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700 -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0001 -# text = (And, by the way, is anybody else just a little nostalgic for the days when that was a good thing?) -1 ( ( PUNCT -LRB- _ 14 punct 14:punct SpaceAfter=No -2 And and CCONJ CC _ 14 cc 14:cc SpaceAfter=No -3 , , PUNCT , _ 14 punct 14:punct _ -4 by by ADP IN _ 6 case 6:case _ -5 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -6 way way NOUN NN Number=Sing 14 obl 14:obl:by SpaceAfter=No -7 , , PUNCT , _ 14 punct 14:punct _ -8 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 14 cop 14:cop _ -9 anybody anybody PRON NN Number=Sing 14 nsubj 14:nsubj _ -10 else else ADJ JJ Degree=Pos 9 amod 9:amod _ -11 just just ADV RB _ 13 advmod 13:advmod _ -12 a a DET DT Definite=Ind|PronType=Art 13 det 13:det _ -13 little little ADJ JJ Degree=Pos 14 obl:npmod 14:obl:npmod _ -14 nostalgic nostalgic NOUN NN Number=Sing 0 root 0:root _ -15 for for ADP IN _ 17 case 17:case _ -16 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -17 days day NOUN NNS Number=Plur 14 nmod 14:nmod:for|23:nsubj _ -18 when when ADV WRB PronType=Rel 23 advmod 23:advmod _ -19 that that PRON DT Number=Sing|PronType=Dem 23 nsubj 17:ref _ -20 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 23 cop 23:cop _ -21 a a DET DT Definite=Ind|PronType=Art 23 det 23:det _ -22 good good ADJ JJ Degree=Pos 23 amod 23:amod _ -23 thing thing NOUN NN Number=Sing 17 acl:relcl 17:acl:relcl SpaceAfter=No -24 ? ? PUNCT . _ 14 punct 14:punct SpaceAfter=No -25 ) ) PUNCT -RRB- _ 14 punct 14:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0002 -# text = This BuzzMachine post argues that Google's rush toward ubiquity might backfire -- which we've all heard before, but it's particularly well-put in this post. 
-1 This this DET DT Number=Sing|PronType=Dem 3 det 3:det _ -2 BuzzMachine BuzzMachine PROPN NNP Number=Sing 3 compound 3:compound _ -3 post post NOUN NN Number=Sing 4 nsubj 4:nsubj _ -4 argues argue VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -5 that that SCONJ IN _ 12 mark 12:mark _ -6 Google Google PROPN NNP Number=Sing 8 nmod:poss 8:nmod:poss SpaceAfter=No -7 's 's PART POS _ 6 case 6:case _ -8 rush rush NOUN NN Number=Sing 12 nsubj 12:nsubj _ -9 toward toward ADP IN _ 10 case 10:case _ -10 ubiquity ubiquity NOUN NN Number=Sing 8 nmod 8:nmod:toward _ -11 might might AUX MD VerbForm=Fin 12 aux 12:aux _ -12 backfire backfire VERB VB VerbForm=Inf 4 ccomp 4:ccomp|18:obj _ -13 -- -- PUNCT , _ 12 punct 12:punct _ -14 which which PRON WDT PronType=Rel 18 obj 12:ref _ -15 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 18 nsubj 18:nsubj SpaceAfter=No -16 've have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 18 aux 18:aux _ -17 all all ADV RB _ 18 advmod 18:advmod _ -18 heard hear VERB VBN Tense=Past|VerbForm=Part 12 acl:relcl 12:acl:relcl _ -19 before before ADV RB _ 18 advmod 18:advmod SpaceAfter=No -20 , , PUNCT , _ 27 punct 27:punct _ -21 but but CCONJ CC _ 27 cc 27:cc _ -22 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 27 nsubj:pass 27:nsubj:pass SpaceAfter=No -23 's be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 27 aux:pass 27:aux:pass _ -24 particularly particularly ADV RB _ 27 advmod 27:advmod _ -25 well well ADV RB Degree=Pos 27 advmod 27:advmod SpaceAfter=No -26 - - PUNCT HYPH _ 27 punct 27:punct SpaceAfter=No -27 put put VERB VBN Tense=Past|VerbForm=Part 4 conj 4:conj:but _ -28 in in ADP IN _ 30 case 30:case _ -29 this this DET DT Number=Sing|PronType=Dem 30 det 30:det _ -30 post post NOUN NN Number=Sing 27 obl 27:obl:in SpaceAfter=No -31 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0003 -# text = Google is a nice search engine. -1 Google Google PROPN NNP Number=Sing 6 nsubj 6:nsubj _ -2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ -3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ -4 nice nice ADJ JJ Degree=Pos 6 amod 6:amod _ -5 search search NOUN NN Number=Sing 6 compound 6:compound _ -6 engine engine NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -7 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0004 -# text = Does anybody use it for anything else? -1 Does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ -2 anybody anybody PRON NN Number=Sing 3 nsubj 3:nsubj _ -3 use use VERB VB VerbForm=Inf 0 root 0:root _ -4 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 obj 3:obj _ -5 for for ADP IN _ 6 case 6:case _ -6 anything anything PRON NN Number=Sing 3 obl 3:obl:for _ -7 else else ADJ JJ Degree=Pos 6 amod 6:amod SpaceAfter=No -8 ? ? PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0005 -# text = They own blogger, of course. -1 They they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 2 nsubj 2:nsubj _ -2 own own VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 blogger blogger PROPN NNP Number=Sing 2 obj 2:obj SpaceAfter=No -4 , , PUNCT , _ 2 punct 2:punct _ -5 of of ADV RB _ 2 advmod 2:advmod _ -6 course course ADV RB _ 5 fixed 5:fixed SpaceAfter=No -7 . . PUNCT . 
_ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0006 -# text = Is that a money maker? -1 Is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 cop 5:cop _ -2 that that PRON DT Number=Sing|PronType=Dem 5 nsubj 5:nsubj _ -3 a a DET DT Definite=Ind|PronType=Art 5 det 5:det _ -4 money money NOUN NN Number=Sing 5 compound 5:compound _ -5 maker maker NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -6 ? ? PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0007 -# text = I'm staying away from the stock. -1 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 3 nsubj 3:nsubj SpaceAfter=No -2 'm be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ -3 staying stay VERB VBG Tense=Pres|VerbForm=Part 0 root 0:root _ -4 away away ADV RB _ 3 advmod 3:advmod _ -5 from from ADP IN _ 7 case 7:case _ -6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -7 stock stock NOUN NN Number=Sing 4 obl 4:obl:from SpaceAfter=No -8 . . PUNCT . _ 3 punct 3:punct _ - -# newdoc id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010 -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0001 -# text = I doubt the very few who actually read my blog have not come across this yet, but I figured I would put it out there anyways. -1 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 2 nsubj 2:nsubj _ -2 doubt doubt VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 the the DET DT Definite=Def|PronType=Art 5 det 5:det _ -4 very very ADV RB _ 5 advmod 5:advmod _ -5 few few ADJ JJ Degree=Pos 13 nsubj 8:nsubj|13:nsubj _ -6 who who PRON WP PronType=Rel 8 nsubj 5:ref _ -7 actually actually ADV RB _ 8 advmod 8:advmod _ -8 read read VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 5 acl:relcl 5:acl:relcl _ -9 my my PRON PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs 10 nmod:poss 10:nmod:poss _ -10 blog blog NOUN NN Number=Sing 8 obj 8:obj _ -11 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 13 aux 13:aux _ -12 not not PART RB _ 13 advmod 13:advmod _ -13 come come VERB VBN Tense=Past|VerbForm=Part 2 ccomp 2:ccomp _ -14 across across ADP IN _ 15 case 15:case _ -15 this this PRON DT Number=Sing|PronType=Dem 13 obl 13:obl:across _ -16 yet yet ADV RB _ 13 advmod 13:advmod SpaceAfter=No -17 , , PUNCT , _ 20 punct 20:punct _ -18 but but CCONJ CC _ 20 cc 20:cc _ -19 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 20 nsubj 20:nsubj _ -20 figured figure VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 2 conj 2:conj:but _ -21 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 23 nsubj 23:nsubj _ -22 would would AUX MD VerbForm=Fin 23 aux 23:aux _ -23 put put VERB VB VerbForm=Inf 20 ccomp 20:ccomp _ -24 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 23 obj 23:obj _ -25 out out ADV RB _ 26 advmod 26:advmod _ -26 there there ADV RB PronType=Dem 23 advmod 23:advmod _ -27 anyways anyways ADV RB _ 23 advmod 23:advmod SpaceAfter=No -28 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0002 -# text = John Donovan from Argghhh! has put out a excellent slide show on what was actually found and fought for in Fallujah. -1 John John PROPN NNP Number=Sing 6 nsubj 6:nsubj _ -2 Donovan Donovan PROPN NNP Number=Sing 1 flat 1:flat _ -3 from from ADP IN _ 4 case 4:case _ -4 Argghhh! Argghhh! 
PROPN NNP Number=Sing 1 nmod 1:nmod:from _ -5 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 aux 6:aux _ -6 put put VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -7 out out ADP RP _ 6 compound:prt 6:compound:prt _ -8 a a DET DT Definite=Ind|PronType=Art 11 det 11:det _ -9 excellent excellent ADJ JJ Degree=Pos 11 amod 11:amod _ -10 slide slide NOUN NN Number=Sing 11 compound 11:compound _ -11 show show NOUN NN Number=Sing 6 obj 6:obj _ -12 on on SCONJ IN _ 13 case 13:case _ -13 what what PRON WP PronType=Int 11 nmod 11:nmod:on _ -14 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 16 aux:pass 16:aux:pass _ -15 actually actually ADV RB _ 16 advmod 16:advmod _ -16 found find VERB VBN Tense=Past|VerbForm=Part 13 acl:relcl 13:acl:relcl _ -17 and and CCONJ CC _ 18 cc 18:cc _ -18 fought fight VERB VBN Tense=Past|VerbForm=Part 16 conj 13:acl:relcl|16:conj:and _ -19 for for ADP IN _ 18 obl 18:obl _ -20 in in ADP IN _ 21 case 21:case _ -21 Fallujah Fallujah PROPN NNP Number=Sing 16 obl 16:obl:in SpaceAfter=No -22 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0003 -# text = Click here To view it. -1 Click click VERB VB Mood=Imp|VerbForm=Fin 0 root 0:root _ -2 here here ADV RB PronType=Dem 1 advmod 1:advmod _ -3 To to PART TO _ 4 mark 4:mark _ -4 view view VERB VB VerbForm=Inf 1 advcl 1:advcl:to _ -5 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 4 obj 4:obj SpaceAfter=No -6 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0004 -# text = He makes some good observations on a few of the pic's. -1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 2 nsubj 2:nsubj _ -2 makes make VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 some some DET DT _ 5 det 5:det _ -4 good good ADJ JJ Degree=Pos 5 amod 5:amod _ -5 observations observation NOUN NNS Number=Plur 2 obj 2:obj _ -6 on on ADP IN _ 8 case 8:case _ -7 a a DET DT Definite=Ind|PronType=Art 8 det 8:det _ -8 few few ADJ JJ Degree=Pos 5 nmod 5:nmod:on _ -9 of of ADP IN _ 11 case 11:case _ -10 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -11 pic's pic' NOUN NNS Number=Plur 8 obl 8:obl:of SpaceAfter=No -12 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0005 -# text = One of the pictures shows a flag that was found in Fallujah. -1 One one NUM CD NumType=Card 5 nsubj 5:nsubj _ -2 of of ADP IN _ 4 case 4:case _ -3 the the DET DT Definite=Def|PronType=Art 4 det 4:det _ -4 pictures picture NOUN NNS Number=Plur 1 nmod 1:nmod:of _ -5 shows show VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -6 a a DET DT Definite=Ind|PronType=Art 7 det 7:det _ -7 flag flag NOUN NN Number=Sing 5 obj 5:obj|10:nsubj:pass _ -8 that that PRON WDT PronType=Rel 10 nsubj:pass 7:ref _ -9 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 10 aux:pass 10:aux:pass _ -10 found find VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 7 acl:relcl 7:acl:relcl _ -11 in in ADP IN _ 12 case 12:case _ -12 Fallujah Fallujah PROPN NNP Number=Sing 10 obl 10:obl:in SpaceAfter=No -13 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0006 -# text = On the next two pictures he took screenshots of two beheading video's. 
-1 On on ADP IN _ 5 case 5:case _ -2 the the DET DT Definite=Def|PronType=Art 5 det 5:det _ -3 next next ADJ JJ Degree=Pos 5 amod 5:amod _ -4 two two NUM CD NumType=Card 5 nummod 5:nummod _ -5 pictures picture NOUN NNS Number=Plur 7 obl 7:obl:on _ -6 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 7 nsubj 7:nsubj _ -7 took take VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -8 screenshots screenshot NOUN NNS Number=Plur 7 obj 7:obj _ -9 of of ADP IN _ 12 case 12:case _ -10 two two NUM CD NumType=Card 12 nummod 12:nummod _ -11 beheading beheading NOUN NN Number=Sing 12 compound 12:compound _ -12 video's video' NOUN NNS Number=Plur 8 nmod 8:nmod:of SpaceAfter=No -13 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0007 -# text = Compare the flags to the Fallujah one. -1 Compare compare VERB VB Mood=Imp|VerbForm=Fin 0 root 0:root _ -2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _ -3 flags flag NOUN NNS Number=Plur 1 obj 1:obj _ -4 to to ADP IN _ 7 case 7:case _ -5 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -6 Fallujah Fallujah PROPN NNP Number=Sing 7 compound 7:compound _ -7 one one NOUN NN Number=Sing 1 obl 1:obl:to SpaceAfter=No -8 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0008 -# text = You have to see these slides....they are amazing. -1 You you PRON PRP Case=Nom|Person=2|PronType=Prs 2 nsubj 2:nsubj|4:nsubj:xsubj _ -2 have have VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 to to PART TO _ 4 mark 4:mark _ -4 see see VERB VB VerbForm=Inf 2 xcomp 2:xcomp _ -5 these these DET DT Number=Plur|PronType=Dem 6 det 6:det _ -6 slides slide NOUN NNS Number=Plur 4 obj 4:obj SpaceAfter=No -7 .... .... PUNCT , _ 2 punct 2:punct SpaceAfter=No -8 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 10 nsubj 10:nsubj _ -9 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 10 cop 10:cop _ -10 amazing amazing ADJ JJ Degree=Pos 2 ccomp 2:ccomp SpaceAfter=No -11 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_floppingaces_20041126180010_ENG_20041126_180010-0009 -# text = This Fallujah operation my turn out to be the most important operation done by the US Military since the end of the war. 
-1 This this DET DT Number=Sing|PronType=Dem 3 det 3:det _ -2 Fallujah Fallujah PROPN NNP Number=Sing 3 compound 3:compound _ -3 operation operation NOUN NN Number=Sing 5 nsubj 5:nsubj|12:nsubj:xsubj _ -4 my may AUX MD Typo=Yes|VerbForm=Fin 5 aux 5:aux _ -5 turn turn VERB VB VerbForm=Inf 0 root 0:root _ -6 out out ADP RP _ 5 compound:prt 5:compound:prt _ -7 to to PART TO _ 12 mark 12:mark _ -8 be be AUX VB VerbForm=Inf 12 cop 12:cop _ -9 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -10 most most ADV RBS _ 11 advmod 11:advmod _ -11 important important ADJ JJ Degree=Pos 12 amod 12:amod _ -12 operation operation NOUN NN Number=Sing 5 xcomp 5:xcomp _ -13 done do VERB VBN Tense=Past|VerbForm=Part 12 advcl 12:advcl _ -14 by by ADP IN _ 17 case 17:case _ -15 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -16 US US PROPN NNP Number=Sing 17 compound 17:compound _ -17 Military military NOUN NN Number=Sing 13 obl 13:obl:by _ -18 since since ADP IN _ 20 case 20:case _ -19 the the DET DT Definite=Def|PronType=Art 20 det 20:det _ -20 end end NOUN NN Number=Sing 13 obl 13:obl:since _ -21 of of ADP IN _ 23 case 23:case _ -22 the the DET DT Definite=Def|PronType=Art 23 det 23:det _ -23 war war NOUN NN Number=Sing 20 nmod 20:nmod:of SpaceAfter=No -24 . . PUNCT . _ 5 punct 5:punct _ - -# newdoc id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500 -# sent_id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500-0001 -# text = Let me join the chorus of annoyance over Google's new toolbar , which, as noted in the linked article, commits just about every sin an online marketer could commit, and makes up a few new ones besides. -1 Let let VERB VB Mood=Imp|VerbForm=Fin 0 root 0:root _ -2 me I PRON PRP Case=Acc|Number=Sing|Person=1|PronType=Prs 1 obj 1:obj|3:nsubj:xsubj _ -3 join join VERB VB VerbForm=Inf 1 xcomp 1:xcomp _ -4 the the DET DT Definite=Def|PronType=Art 5 det 5:det _ -5 chorus chorus NOUN NN Number=Sing 3 obj 3:obj _ -6 of of ADP IN _ 7 case 7:case _ -7 annoyance annoyance NOUN NN Number=Sing 5 nmod 5:nmod:of _ -8 over over ADP IN _ 12 case 12:case _ -9 Google Google PROPN NNP Number=Sing 12 nmod:poss 12:nmod:poss SpaceAfter=No -10 's 's PART POS _ 9 case 9:case _ -11 new new ADJ JJ Degree=Pos 12 amod 12:amod _ -12 toolbar toolbar NOUN NN Number=Sing 7 nmod 7:nmod:over|23:nsubj|35:nsubj _ -13 , , PUNCT , _ 12 punct 12:punct _ -14 which which PRON WDT PronType=Rel 23 nsubj 12:ref SpaceAfter=No -15 , , PUNCT , _ 23 punct 23:punct _ -16 as as SCONJ IN _ 17 mark 17:mark _ -17 noted note VERB VBN Tense=Past|VerbForm=Part 23 advcl 23:advcl:as _ -18 in in ADP IN _ 21 case 21:case _ -19 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ -20 linked link VERB VBN Tense=Past|VerbForm=Part 21 amod 21:amod _ -21 article article NOUN NN Number=Sing 17 obl 17:obl:in SpaceAfter=No -22 , , PUNCT , _ 23 punct 23:punct _ -23 commits commit VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 12 acl:relcl 12:acl:relcl _ -24 just just ADV RB _ 25 advmod 25:advmod _ -25 about about ADV RB _ 27 advmod 27:advmod _ -26 every every DET DT _ 27 det 27:det _ -27 sin sin NOUN NN Number=Sing 23 obj 23:obj _ -28 an a DET DT Definite=Ind|PronType=Art 30 det 30:det _ -29 online online ADJ JJ Degree=Pos 30 amod 30:amod _ -30 marketer marketer NOUN NN Number=Sing 32 nsubj 32:nsubj _ -31 could could AUX MD VerbForm=Fin 32 aux 32:aux _ -32 commit commit VERB VB VerbForm=Inf 27 acl:relcl 27:acl:relcl SpaceAfter=No -33 , , PUNCT , _ 35 punct 35:punct _ -34 and and CCONJ 
CC _ 35 cc 35:cc _ -35 makes make VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 23 conj 12:acl:relcl|23:conj:and _ -36 up up ADP RP _ 35 compound:prt 35:compound:prt _ -37 a a DET DT Definite=Ind|PronType=Art 40 det 40:det _ -38 few few ADJ JJ Degree=Pos 40 amod 40:amod _ -39 new new ADJ JJ Degree=Pos 40 amod 40:amod _ -40 ones one NOUN NNS Number=Plur 35 obj 35:obj _ -41 besides besides ADV RB _ 35 advmod 35:advmod SpaceAfter=No -42 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500-0002 -# text = I'm not fond of the Google-hates-privacy argument -1 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 4 nsubj 4:nsubj SpaceAfter=No -2 'm be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 4 cop 4:cop _ -3 not not PART RB _ 4 advmod 4:advmod _ -4 fond fond ADJ JJ Degree=Pos 0 root 0:root _ -5 of of ADP IN _ 12 case 12:case _ -6 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -7 Google Google PROPN NNP Number=Sing 9 nsubj 9:nsubj SpaceAfter=No -8 - - PUNCT HYPH _ 9 punct 9:punct SpaceAfter=No -9 hates hate VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 12 compound 12:compound SpaceAfter=No -10 - - PUNCT HYPH _ 9 punct 9:punct SpaceAfter=No -11 privacy privacy NOUN NN Number=Sing 9 obj 9:obj _ -12 argument argument NOUN NN Number=Sing 4 obl 4:obl:of _ - -# sent_id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500-0003 -# text = (You don't need to use their site, you can opt-out of sharing your information, you don't need to send stuff to anyone with a Gmail account, and if -- wonder of wonders -- you're worried that you might send something to someone who would forward an excerpt to someone who would then store it on a Gmail account... you have far, far too much time on your hands). 
-1 ( ( PUNCT -LRB- _ 5 punct 5:punct SpaceAfter=No -2 You you PRON PRP Case=Nom|Person=2|PronType=Prs 5 nsubj 5:nsubj|7:nsubj:xsubj _ -3 do do AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 5 aux 5:aux SpaceAfter=No -4 n't not PART RB _ 5 advmod 5:advmod _ -5 need need VERB VB VerbForm=Inf 0 root 0:root _ -6 to to PART TO _ 7 mark 7:mark _ -7 use use VERB VB VerbForm=Inf 5 xcomp 5:xcomp _ -8 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 9 nmod:poss 9:nmod:poss _ -9 site site NOUN NN Number=Sing 7 obj 7:obj SpaceAfter=No -10 , , PUNCT , _ 13 punct 13:punct _ -11 you you PRON PRP Case=Nom|Person=2|PronType=Prs 13 nsubj 13:nsubj _ -12 can can AUX MD VerbForm=Fin 13 aux 13:aux _ -13 opt opt VERB VB VerbForm=Inf 5 conj 5:conj:and SpaceAfter=No -14 - - PUNCT HYPH _ 13 punct 13:punct SpaceAfter=No -15 out out ADP RP _ 13 compound:prt 13:compound:prt _ -16 of of SCONJ IN _ 17 mark 17:mark _ -17 sharing share VERB VBG VerbForm=Ger 13 advcl 13:advcl:of _ -18 your you PRON PRP$ Person=2|Poss=Yes|PronType=Prs 19 nmod:poss 19:nmod:poss _ -19 information information NOUN NN Number=Sing 17 obj 17:obj SpaceAfter=No -20 , , PUNCT , _ 24 punct 24:punct _ -21 you you PRON PRP Case=Nom|Person=2|PronType=Prs 24 nsubj 24:nsubj|26:nsubj:xsubj _ -22 do do AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 24 aux 24:aux SpaceAfter=No -23 n't not PART RB _ 24 advmod 24:advmod _ -24 need need VERB VB VerbForm=Inf 5 conj 5:conj:and _ -25 to to PART TO _ 26 mark 26:mark _ -26 send send VERB VB VerbForm=Inf 24 xcomp 24:xcomp _ -27 stuff stuff NOUN NN Number=Sing 26 obj 26:obj _ -28 to to ADP IN _ 29 case 29:case _ -29 anyone anyone PRON NN Number=Sing 26 obl 26:obl:to _ -30 with with ADP IN _ 33 case 33:case _ -31 a a DET DT Definite=Ind|PronType=Art 33 det 33:det _ -32 Gmail Gmail PROPN NNP Number=Sing 33 compound 33:compound _ -33 account account NOUN NN Number=Sing 29 nmod 29:nmod:with SpaceAfter=No -34 , , PUNCT , _ 70 punct 70:punct _ -35 and and CCONJ CC _ 70 cc 70:cc _ -36 if if SCONJ IN _ 44 mark 44:mark _ -37 -- -- PUNCT , _ 44 punct 44:punct _ -38 wonder wonder NOUN NN Number=Sing 44 parataxis 44:parataxis _ -39 of of ADP IN _ 40 case 40:case _ -40 wonders wonder NOUN NNS Number=Plur 38 nmod 38:nmod:of _ -41 -- -- PUNCT , _ 44 punct 44:punct _ -42 you you PRON PRP Case=Nom|Person=2|PronType=Prs 44 nsubj 44:nsubj SpaceAfter=No -43 're be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 44 cop 44:cop _ -44 worried worried ADJ JJ Degree=Pos 70 advcl 70:advcl:if _ -45 that that SCONJ IN _ 48 mark 48:mark _ -46 you you PRON PRP Case=Nom|Person=2|PronType=Prs 48 nsubj 48:nsubj _ -47 might might AUX MD VerbForm=Fin 48 aux 48:aux _ -48 send send VERB VB VerbForm=Inf 44 ccomp 44:ccomp _ -49 something something PRON NN Number=Sing 48 obj 48:obj _ -50 to to ADP IN _ 51 case 51:case _ -51 someone someone PRON NN Number=Sing 48 obl 48:obl:to|54:nsubj _ -52 who who PRON WP PronType=Rel 54 nsubj 51:ref _ -53 would would AUX MD VerbForm=Fin 54 aux 54:aux _ -54 forward forward VERB VB VerbForm=Inf 51 acl:relcl 51:acl:relcl _ -55 an a DET DT Definite=Ind|PronType=Art 56 det 56:det _ -56 excerpt excerpt NOUN NN Number=Sing 54 obj 54:obj _ -57 to to ADP IN _ 58 case 58:case _ -58 someone someone PRON NN Number=Sing 54 obl 54:obl:to|62:nsubj _ -59 who who PRON WP PronType=Rel 62 nsubj 58:ref _ -60 would would AUX MD VerbForm=Fin 62 aux 62:aux _ -61 then then ADV RB PronType=Dem 62 advmod 62:advmod _ -62 store store VERB VB VerbForm=Inf 58 acl:relcl 58:acl:relcl _ -63 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 
62 obj 62:obj _ -64 on on ADP IN _ 67 case 67:case _ -65 a a DET DT Definite=Ind|PronType=Art 67 det 67:det _ -66 Gmail Gmail PROPN NNP Number=Sing 67 compound 67:compound _ -67 account account NOUN NN Number=Sing 62 obl 62:obl:on SpaceAfter=No -68 ... ... PUNCT , _ 70 punct 70:punct _ -69 you you PRON PRP Case=Nom|Person=2|PronType=Prs 70 nsubj 70:nsubj _ -70 have have VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 5 conj 5:conj:and _ -71 far far ADV RB Degree=Pos 75 advmod 75:advmod SpaceAfter=No -72 , , PUNCT , _ 75 punct 75:punct _ -73 far far ADV RB Degree=Pos 75 advmod 75:advmod _ -74 too too ADV RB _ 75 advmod 75:advmod _ -75 much much ADJ JJ Degree=Pos 76 amod 76:amod _ -76 time time NOUN NN Number=Sing 70 obj 70:obj _ -77 on on ADP IN _ 79 case 79:case _ -78 your you PRON PRP$ Person=2|Poss=Yes|PronType=Prs 79 nmod:poss 79:nmod:poss _ -79 hands hand NOUN NNS Number=Plur 70 obl 70:obl:on SpaceAfter=No -80 ) ) PUNCT -RRB- _ 5 punct 5:punct SpaceAfter=No -81 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500-0004 -# text = However, this toolbar is really bad news. -1 However however ADV RB _ 8 advmod 8:advmod SpaceAfter=No -2 , , PUNCT , _ 8 punct 8:punct _ -3 this this DET DT Number=Sing|PronType=Dem 4 det 4:det _ -4 toolbar toolbar NOUN NN Number=Sing 8 nsubj 8:nsubj _ -5 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 cop 8:cop _ -6 really really ADV RB _ 7 advmod 7:advmod _ -7 bad bad ADJ JJ Degree=Pos 8 amod 8:amod _ -8 news news NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -9 . . PUNCT . _ 8 punct 8:punct _ - -# sent_id = weblog-blogspot.com_marketview_20050224181500_ENG_20050224_181500-0005 -# text = On the other hand, it looks pretty cool. -1 On on ADP IN _ 4 case 4:case _ -2 the the DET DT Definite=Def|PronType=Art 4 det 4:det _ -3 other other ADJ JJ Degree=Pos 4 amod 4:amod _ -4 hand hand NOUN NN Number=Sing 7 obl 7:obl:on SpaceAfter=No -5 , , PUNCT , _ 7 punct 7:punct _ -6 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 7 nsubj 7:nsubj|9:nsubj:xsubj _ -7 looks look VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -8 pretty pretty ADV RB _ 9 advmod 9:advmod _ -9 cool cool ADJ JJ Degree=Pos 7 xcomp 7:xcomp SpaceAfter=No -10 . . PUNCT . _ 7 punct 7:punct _ - -# newdoc id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000 -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0001 -# text = Iran says it is creating nuclear energy without wanting nuclear weapons. -1 Iran Iran PROPN NNP Number=Sing 2 nsubj 2:nsubj _ -2 says say VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 5 nsubj 5:nsubj _ -4 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 aux 5:aux _ -5 creating create VERB VBG VerbForm=Ger 2 ccomp 2:ccomp _ -6 nuclear nuclear ADJ JJ Degree=Pos 7 amod 7:amod _ -7 energy energy NOUN NN Number=Sing 5 obj 5:obj _ -8 without without SCONJ IN _ 9 mark 9:mark _ -9 wanting want VERB VBG VerbForm=Ger 5 advcl 5:advcl:without _ -10 nuclear nuclear ADJ JJ Degree=Pos 11 amod 11:amod _ -11 weapons weapon NOUN NNS Number=Plur 9 obj 9:obj SpaceAfter=No -12 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0002 -# text = The United States doesn't believe the Iranian Government. 
-1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _ -2 United United PROPN NNP Number=Sing 3 compound 3:compound _ -3 States States PROPN NNP Number=Sing 6 nsubj 6:nsubj _ -4 does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 aux 6:aux SpaceAfter=No -5 n't not PART RB _ 6 advmod 6:advmod _ -6 believe believe VERB VB VerbForm=Inf 0 root 0:root _ -7 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -8 Iranian iranian ADJ JJ Degree=Pos 9 amod 9:amod _ -9 Government government NOUN NN Number=Sing 6 obj 6:obj SpaceAfter=No -10 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0003 -# text = One can suspect the Iranian Government. -1 One one PRON PRP _ 3 nsubj 3:nsubj _ -2 can can AUX MD VerbForm=Fin 3 aux 3:aux _ -3 suspect suspect VERB VB VerbForm=Inf 0 root 0:root _ -4 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -5 Iranian iranian ADJ JJ Degree=Pos 6 amod 6:amod _ -6 Government government NOUN NN Number=Sing 3 obj 3:obj SpaceAfter=No -7 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0004 -# text = But there is no proof . -1 But but CCONJ CC _ 3 cc 3:cc _ -2 there there PRON EX _ 3 expl 3:expl _ -3 is be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -4 no no DET DT _ 5 det 5:det _ -5 proof proof NOUN NN Number=Sing 3 nsubj 3:nsubj _ -6 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0005 -# text = I read an Article in Time magazine accusing the Iranian Government of being willing to start a nuclear war and I sympathise with the Article. -1 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 2 nsubj 2:nsubj _ -2 read read VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -3 an a DET DT Definite=Ind|PronType=Art 4 det 4:det _ -4 Article article NOUN NN Number=Sing 2 obj 2:obj _ -5 in in ADP IN _ 7 case 7:case _ -6 Time Time PROPN NNP Number=Sing 7 compound 7:compound _ -7 magazine magazine PROPN NNP Number=Sing 4 nmod 4:nmod:in _ -8 accusing accuse VERB VBG VerbForm=Ger 4 acl 4:acl _ -9 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -10 Iranian iranian ADJ JJ Degree=Pos 11 amod 11:amod _ -11 Government government NOUN NN Number=Sing 8 obj 8:obj _ -12 of of SCONJ IN _ 14 mark 14:mark _ -13 being be AUX VBG VerbForm=Ger 14 cop 14:cop _ -14 willing willing ADJ JJ Degree=Pos 8 advcl 8:advcl:of _ -15 to to PART TO _ 16 mark 16:mark _ -16 start start VERB VB VerbForm=Inf 14 xcomp 14:xcomp _ -17 a a DET DT Definite=Ind|PronType=Art 19 det 19:det _ -18 nuclear nuclear ADJ JJ Degree=Pos 19 amod 19:amod _ -19 war war NOUN NN Number=Sing 16 obj 16:obj _ -20 and and CCONJ CC _ 22 cc 22:cc _ -21 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 22 nsubj 22:nsubj _ -22 sympathise sympathise VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 2 conj 2:conj:and _ -23 with with ADP IN _ 25 case 25:case _ -24 the the DET DT Definite=Def|PronType=Art 25 det 25:det _ -25 Article article NOUN NN Number=Sing 22 obl 22:obl:with SpaceAfter=No -26 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-blogspot.com_grandpasgripes_20060413051000_ENG_20060413_051000-0006 -# text = They are certainly being nasty to the United Nations Security Council in connection with the anti-proliferation treaty. 
-1 They they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 5 nsubj 5:nsubj _ -2 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 5 aux 5:aux _ -3 certainly certainly ADV RB _ 5 advmod 5:advmod _ -4 being be AUX VBG VerbForm=Ger 5 cop 5:cop _ -5 nasty nasty ADJ JJ Degree=Pos 0 root 0:root _ -6 to to ADP IN _ 11 case 11:case _ -7 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -8 United United PROPN NNP Number=Sing 9 compound 9:compound _ -9 Nations Nations PROPN NNP Number=Sing 11 compound 11:compound _ -10 Security Security PROPN NNP Number=Sing 11 compound 11:compound _ -11 Council Council PROPN NNP Number=Sing 5 obl 5:obl:to _ -12 in in ADP IN _ 13 case 13:case _ -13 connection connection NOUN NN Number=Sing 5 obl 5:obl:in _ -14 with with ADP IN _ 17 case 17:case _ -15 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -16 anti-proliferation anti-proliferation NOUN NN Number=Sing 17 compound 17:compound _ -17 treaty treaty NOUN NN Number=Sing 13 nmod 13:nmod:with SpaceAfter=No -18 . . PUNCT . _ 5 punct 5:punct _ - diff --git a/tests/test_corpus/en_ewt-ud-test.txt b/tests/test_corpus/en_ewt-ud-test.txt deleted file mode 100644 index 011a83e64..000000000 --- a/tests/test_corpus/en_ewt-ud-test.txt +++ /dev/null @@ -1,38 +0,0 @@ -What if Google Morphed Into GoogleOS? What if Google expanded on its -search-engine (and now e-mail) wares into a full-fledged operating system? [via -Microsoft Watch from Mary Jo Foley ] - -(And, by the way, is anybody else just a little nostalgic for the days when that -was a good thing?) This BuzzMachine post argues that Google's rush toward -ubiquity might backfire -- which we've all heard before, but it's particularly -well-put in this post. Google is a nice search engine. Does anybody use it for -anything else? They own blogger, of course. Is that a money maker? I'm staying -away from the stock. - -I doubt the very few who actually read my blog have not come across this yet, -but I figured I would put it out there anyways. John Donovan from Argghhh! has -put out a excellent slide show on what was actually found and fought for in -Fallujah. Click here To view it. He makes some good observations on a few of the -pic's. One of the pictures shows a flag that was found in Fallujah. On the next -two pictures he took screenshots of two beheading video's. Compare the flags to -the Fallujah one. You have to see these slides....they are amazing. This -Fallujah operation my turn out to be the most important operation done by the US -Military since the end of the war. - -Let me join the chorus of annoyance over Google's new toolbar , which, as noted -in the linked article, commits just about every sin an online marketer could -commit, and makes up a few new ones besides. I'm not fond of the -Google-hates-privacy argument (You don't need to use their site, you can opt-out -of sharing your information, you don't need to send stuff to anyone with a Gmail -account, and if -- wonder of wonders -- you're worried that you might send -something to someone who would forward an excerpt to someone who would then -store it on a Gmail account... you have far, far too much time on your hands). -However, this toolbar is really bad news. On the other hand, it looks pretty -cool. - -Iran says it is creating nuclear energy without wanting nuclear weapons. The -United States doesn't believe the Iranian Government. One can suspect the -Iranian Government. But there is no proof . 
I read an Article in Time magazine -accusing the Iranian Government of being willing to start a nuclear war and I -sympathise with the Article. They are certainly being nasty to the United -Nations Security Council in connection with the anti-proliferation treaty. diff --git a/tests/test_corpus/en_ewt-ud-train.conllu b/tests/test_corpus/en_ewt-ud-train.conllu deleted file mode 100644 index 97b702eca..000000000 --- a/tests/test_corpus/en_ewt-ud-train.conllu +++ /dev/null @@ -1,1269 +0,0 @@ -# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000 -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001 -# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border. -1 Al Al PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No -2 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No -3 Zaman Zaman PROPN NNP Number=Sing 1 flat 1:flat _ -4 : : PUNCT : _ 1 punct 1:punct _ -5 American american ADJ JJ Degree=Pos 6 amod 6:amod _ -6 forces force NOUN NNS Number=Plur 7 nsubj 7:nsubj _ -7 killed kill VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ -8 Shaikh Shaikh PROPN NNP Number=Sing 7 obj 7:obj _ -9 Abdullah Abdullah PROPN NNP Number=Sing 8 flat 8:flat _ -10 al al PROPN NNP Number=Sing 8 flat 8:flat SpaceAfter=No -11 - - PUNCT HYPH _ 8 punct 8:punct SpaceAfter=No -12 Ani Ani PROPN NNP Number=Sing 8 flat 8:flat SpaceAfter=No -13 , , PUNCT , _ 8 punct 8:punct _ -14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ -15 preacher preacher NOUN NN Number=Sing 8 appos 8:appos _ -16 at at ADP IN _ 18 case 18:case _ -17 the the DET DT Definite=Def|PronType=Art 18 det 18:det _ -18 mosque mosque NOUN NN Number=Sing 7 obl 7:obl:at _ -19 in in ADP IN _ 21 case 21:case _ -20 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ -21 town town NOUN NN Number=Sing 18 nmod 18:nmod:in _ -22 of of ADP IN _ 23 case 23:case _ -23 Qaim Qaim PROPN NNP Number=Sing 21 nmod 21:nmod:of SpaceAfter=No -24 , , PUNCT , _ 21 punct 21:punct _ -25 near near ADP IN _ 28 case 28:case _ -26 the the DET DT Definite=Def|PronType=Art 28 det 28:det _ -27 Syrian syrian ADJ JJ Degree=Pos 28 amod 28:amod _ -28 border border NOUN NN Number=Sing 21 nmod 21:nmod:near SpaceAfter=No -29 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002 -# text = [This killing of a respected cleric will be causing us trouble for years to come.] -1 [ [ PUNCT -LRB- _ 10 punct 10:punct SpaceAfter=No -2 This this DET DT Number=Sing|PronType=Dem 3 det 3:det _ -3 killing killing NOUN NN Number=Sing 10 nsubj 10:nsubj _ -4 of of ADP IN _ 7 case 7:case _ -5 a a DET DT Definite=Ind|PronType=Art 7 det 7:det _ -6 respected respected ADJ JJ Degree=Pos 7 amod 7:amod _ -7 cleric cleric NOUN NN Number=Sing 3 nmod 3:nmod:of _ -8 will will AUX MD VerbForm=Fin 10 aux 10:aux _ -9 be be AUX VB VerbForm=Inf 10 aux 10:aux _ -10 causing cause VERB VBG VerbForm=Ger 0 root 0:root _ -11 us we PRON PRP Case=Acc|Number=Plur|Person=1|PronType=Prs 10 iobj 10:iobj _ -12 trouble trouble NOUN NN Number=Sing 10 obj 10:obj _ -13 for for ADP IN _ 14 case 14:case _ -14 years year NOUN NNS Number=Plur 10 obl 10:obl:for _ -15 to to PART TO _ 16 mark 16:mark _ -16 come come VERB VB VerbForm=Inf 14 acl 14:acl:to SpaceAfter=No -17 . . PUNCT . 
_ 10 punct 10:punct SpaceAfter=No -18 ] ] PUNCT -RRB- _ 10 punct 10:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003 -# text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad. -1 DPA DPA PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No -2 : : PUNCT : _ 1 punct 1:punct _ -3 Iraqi iraqi ADJ JJ Degree=Pos 4 amod 4:amod _ -4 authorities authority NOUN NNS Number=Plur 5 nsubj 5:nsubj _ -5 announced announce VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ -6 that that SCONJ IN _ 9 mark 9:mark _ -7 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 9 nsubj 9:nsubj _ -8 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 9 aux 9:aux _ -9 busted bust VERB VBN Tense=Past|VerbForm=Part 5 ccomp 5:ccomp _ -10 up up ADP RP _ 9 compound:prt 9:compound:prt _ -11 3 3 NUM CD NumType=Card 13 nummod 13:nummod _ -12 terrorist terrorist ADJ JJ Degree=Pos 13 amod 13:amod _ -13 cells cell NOUN NNS Number=Plur 9 obj 9:obj _ -14 operating operate VERB VBG VerbForm=Ger 13 acl 13:acl _ -15 in in ADP IN _ 16 case 16:case _ -16 Baghdad Baghdad PROPN NNP Number=Sing 14 obl 14:obl:in SpaceAfter=No -17 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004 -# text = Two of them were being run by 2 officials of the Ministry of the Interior! -1 Two two NUM CD NumType=Card 6 nsubj:pass 6:nsubj:pass _ -2 of of ADP IN _ 3 case 3:case _ -3 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 1 nmod 1:nmod:of _ -4 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 6 aux 6:aux _ -5 being be AUX VBG VerbForm=Ger 6 aux:pass 6:aux:pass _ -6 run run VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -7 by by ADP IN _ 9 case 9:case _ -8 2 2 NUM CD NumType=Card 9 nummod 9:nummod _ -9 officials official NOUN NNS Number=Plur 6 obl 6:obl:by _ -10 of of ADP IN _ 12 case 12:case _ -11 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -12 Ministry Ministry PROPN NNP Number=Sing 9 nmod 9:nmod:of _ -13 of of ADP IN _ 15 case 15:case _ -14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ -15 Interior Interior PROPN NNP Number=Sing 12 nmod 12:nmod:of SpaceAfter=No -16 ! ! PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0005 -# text = The MoI in Iraq is equivalent to the US FBI, so this would be like having J. Edgar Hoover unwittingly employ at a high level members of the Weathermen bombers back in the 1960s. -1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 MoI MoI PROPN NNP Number=Sing 6 nsubj 6:nsubj _ -3 in in ADP IN _ 4 case 4:case _ -4 Iraq Iraq PROPN NNP Number=Sing 2 nmod 2:nmod:in _ -5 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ -6 equivalent equivalent ADJ JJ Degree=Pos 0 root 0:root _ -7 to to ADP IN _ 10 case 10:case _ -8 the the DET DT Definite=Def|PronType=Art 10 det 10:det _ -9 US US PROPN NNP Number=Sing 10 compound 10:compound _ -10 FBI FBI PROPN NNP Number=Sing 6 obl 6:obl:to SpaceAfter=No -11 , , PUNCT , _ 6 punct 6:punct _ -12 so so ADV RB _ 17 advmod 17:advmod _ -13 this this PRON DT Number=Sing|PronType=Dem 15 nsubj 15:nsubj _ -14 would would AUX MD VerbForm=Fin 15 aux 15:aux _ -15 be be VERB VB VerbForm=Inf 6 parataxis 6:parataxis _ -16 like like SCONJ IN _ 17 mark 17:mark _ -17 having have VERB VBG VerbForm=Ger 15 advcl 15:advcl:like _ -18 J. J. 
PROPN NNP Number=Sing 22 nsubj 22:nsubj _ -19 Edgar Edgar PROPN NNP Number=Sing 18 flat 18:flat _ -20 Hoover Hoover PROPN NNP Number=Sing 18 flat 18:flat _ -21 unwittingly unwittingly ADV RB _ 22 advmod 22:advmod _ -22 employ employ VERB VB VerbForm=Inf 17 ccomp 17:ccomp _ -23 at at ADP IN _ 26 case 26:case _ -24 a a DET DT Definite=Ind|PronType=Art 26 det 26:det _ -25 high high ADJ JJ Degree=Pos 26 amod 26:amod _ -26 level level NOUN NN Number=Sing 22 obl 22:obl:at _ -27 members member NOUN NNS Number=Plur 22 obj 22:obj _ -28 of of ADP IN _ 31 case 31:case _ -29 the the DET DT Definite=Def|PronType=Art 31 det 31:det _ -30 Weathermen Weathermen PROPN NNPS Number=Plur 31 compound 31:compound _ -31 bombers bomber NOUN NNS Number=Plur 27 nmod 27:nmod:of _ -32 back back ADV RB _ 35 advmod 35:advmod _ -33 in in ADP IN _ 35 case 35:case _ -34 the the DET DT Definite=Def|PronType=Art 35 det 35:det _ -35 1960s 1960 NOUN NNS Number=Plur 22 obl 22:obl:in SpaceAfter=No -36 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006 -# text = The third was being run by the head of an investment firm. -1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 third third ADJ JJ Degree=Pos|NumType=Ord 5 nsubj:pass 5:nsubj:pass _ -3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 5 aux 5:aux _ -4 being be AUX VBG VerbForm=Ger 5 aux:pass 5:aux:pass _ -5 run run VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -6 by by ADP IN _ 8 case 8:case _ -7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -8 head head NOUN NN Number=Sing 5 obl 5:obl:by _ -9 of of ADP IN _ 12 case 12:case _ -10 an a DET DT Definite=Ind|PronType=Art 12 det 12:det _ -11 investment investment NOUN NN Number=Sing 12 compound 12:compound _ -12 firm firm NOUN NN Number=Sing 8 nmod 8:nmod:of SpaceAfter=No -13 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0007 -# text = You wonder if he was manipulating the market with his bombing targets. -1 You you PRON PRP Case=Nom|Person=2|PronType=Prs 2 nsubj 2:nsubj _ -2 wonder wonder VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 if if SCONJ IN _ 6 mark 6:mark _ -4 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 6 nsubj 6:nsubj _ -5 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 6 aux 6:aux _ -6 manipulating manipulate VERB VBG Tense=Pres|VerbForm=Part 2 ccomp 2:ccomp _ -7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -8 market market NOUN NN Number=Sing 6 obj 6:obj _ -9 with with ADP IN _ 12 case 12:case _ -10 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 12 nmod:poss 12:nmod:poss _ -11 bombing bombing NOUN NN Number=Sing 12 compound 12:compound _ -12 targets target NOUN NNS Number=Plur 6 obl 6:obl:with SpaceAfter=No -13 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0008 -# text = The cells were operating in the Ghazaliyah and al-Jihad districts of the capital. 
-1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 cells cell NOUN NNS Number=Plur 4 nsubj 4:nsubj _ -3 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 4 aux 4:aux _ -4 operating operate VERB VBG Tense=Pres|VerbForm=Part 0 root 0:root _ -5 in in ADP IN _ 12 case 12:case _ -6 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ -7 Ghazaliyah Ghazaliyah PROPN NNP Number=Sing 12 compound 12:compound _ -8 and and CCONJ CC _ 11 cc 11:cc _ -9 al al PROPN NNP Number=Sing 11 compound 11:compound SpaceAfter=No -10 - - PUNCT HYPH _ 11 punct 11:punct SpaceAfter=No -11 Jihad Jihad PROPN NNP Number=Sing 7 conj 7:conj:and|12:compound _ -12 districts district NOUN NNS Number=Plur 4 obl 4:obl:in _ -13 of of ADP IN _ 15 case 15:case _ -14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ -15 capital capital NOUN NN Number=Sing 12 nmod 12:nmod:of SpaceAfter=No -16 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0009 -# text = Although the announcement was probably made to show progress in identifying and breaking up terror cells, I don't find the news that the Baathists continue to penetrate the Iraqi government very hopeful. -1 Although although SCONJ IN _ 6 mark 6:mark _ -2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _ -3 announcement announcement NOUN NN Number=Sing 6 nsubj:pass 6:nsubj:pass|8:nsubj:xsubj _ -4 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 6 aux:pass 6:aux:pass _ -5 probably probably ADV RB _ 6 advmod 6:advmod _ -6 made make VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 21 advcl 21:advcl:although _ -7 to to PART TO _ 8 mark 8:mark _ -8 show show VERB VB VerbForm=Inf 6 xcomp 6:xcomp _ -9 progress progress NOUN NN Number=Sing 8 obj 8:obj _ -10 in in SCONJ IN _ 11 mark 11:mark _ -11 identifying identify VERB VBG VerbForm=Ger 9 acl 9:acl:in _ -12 and and CCONJ CC _ 13 cc 13:cc _ -13 breaking break VERB VBG VerbForm=Ger 11 conj 9:acl:in|11:conj:and _ -14 up up ADP RP _ 13 compound:prt 13:compound:prt _ -15 terror terror NOUN NN Number=Sing 16 compound 16:compound _ -16 cells cell NOUN NNS Number=Plur 11 obj 11:obj|13:obj SpaceAfter=No -17 , , PUNCT , _ 21 punct 21:punct _ -18 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 21 nsubj 21:nsubj _ -19 do do AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 21 aux 21:aux SpaceAfter=No -20 n't not PART RB _ 21 advmod 21:advmod _ -21 find find VERB VB VerbForm=Inf 0 root 0:root _ -22 the the DET DT Definite=Def|PronType=Art 23 det 23:det _ -23 news news NOUN NN Number=Sing 21 obj 21:obj|34:nsubj:xsubj _ -24 that that SCONJ IN _ 27 mark 27:mark _ -25 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -26 Baathists Baathists PROPN NNPS Number=Plur 27 nsubj 27:nsubj|29:nsubj:xsubj _ -27 continue continue VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 23 acl 23:acl:that _ -28 to to PART TO _ 29 mark 29:mark _ -29 penetrate penetrate VERB VB VerbForm=Inf 27 xcomp 27:xcomp _ -30 the the DET DT Definite=Def|PronType=Art 32 det 32:det _ -31 Iraqi iraqi ADJ JJ Degree=Pos 32 amod 32:amod _ -32 government government NOUN NN Number=Sing 29 obj 29:obj _ -33 very very ADV RB _ 34 advmod 34:advmod _ -34 hopeful hopeful ADJ JJ Degree=Pos 21 xcomp 21:xcomp SpaceAfter=No -35 . . PUNCT . _ 21 punct 21:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0010 -# text = It reminds me too much of the ARVN officers who were secretly working for the other side in Vietnam. 
-1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 2 nsubj 2:nsubj _ -2 reminds remind VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -3 me I PRON PRP Case=Acc|Number=Sing|Person=1|PronType=Prs 2 obj 2:obj _ -4 too too ADV RB _ 5 advmod 5:advmod _ -5 much much ADV RB _ 2 advmod 2:advmod _ -6 of of ADP IN _ 9 case 9:case _ -7 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -8 ARVN ARVN PROPN NNP Number=Sing 9 compound 9:compound _ -9 officers officer NOUN NNS Number=Plur 2 obl 2:obl:of|13:nsubj _ -10 who who PRON WP PronType=Rel 13 nsubj 9:ref _ -11 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 13 aux 13:aux _ -12 secretly secretly ADV RB _ 13 advmod 13:advmod _ -13 working work VERB VBG Tense=Pres|VerbForm=Part 9 acl:relcl 9:acl:relcl _ -14 for for ADP IN _ 17 case 17:case _ -15 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -16 other other ADJ JJ Degree=Pos 17 amod 17:amod _ -17 side side NOUN NN Number=Sing 13 obl 13:obl:for _ -18 in in ADP IN _ 19 case 19:case _ -19 Vietnam Vietnam PROPN NNP Number=Sing 13 obl 13:obl:in SpaceAfter=No -20 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0011 -# text = Al-Zaman : Guerrillas killed a member of the Kurdistan Democratic Party after kidnapping him in Mosul. -1 Al Al PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No -2 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No -3 Zaman Zaman PROPN NNP Number=Sing 1 flat 1:flat _ -4 : : PUNCT : _ 1 punct 1:punct _ -5 Guerrillas guerrilla NOUN NNS Number=Plur 6 nsubj 6:nsubj _ -6 killed kill VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ -7 a a DET DT Definite=Ind|PronType=Art 8 det 8:det _ -8 member member NOUN NN Number=Sing 6 obj 6:obj _ -9 of of ADP IN _ 13 case 13:case _ -10 the the DET DT Definite=Def|PronType=Art 13 det 13:det _ -11 Kurdistan Kurdistan PROPN NNP Number=Sing 13 compound 13:compound _ -12 Democratic Democratic PROPN NNP Number=Sing 13 compound 13:compound _ -13 Party Party PROPN NNP Number=Sing 8 nmod 8:nmod:of _ -14 after after SCONJ IN _ 15 mark 15:mark _ -15 kidnapping kidnap VERB VBG VerbForm=Ger 6 advcl 6:advcl:after _ -16 him he PRON PRP Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs 15 obj 15:obj _ -17 in in ADP IN _ 18 case 18:case _ -18 Mosul Mosul PROPN NNP Number=Sing 15 obl 15:obl:in SpaceAfter=No -19 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0012 -# text = The police commander of Ninevah Province announced that bombings had declined 80 percent in Mosul, whereas there had been a big jump in the number of kidnappings. 
-1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _ -2 police police NOUN NN Number=Sing 3 compound 3:compound _ -3 commander commander NOUN NN Number=Sing 7 nsubj 7:nsubj _ -4 of of ADP IN _ 6 case 6:case _ -5 Ninevah Ninevah PROPN NNP Number=Sing 6 compound 6:compound _ -6 Province Province PROPN NNP Number=Sing 3 nmod 3:nmod:of _ -7 announced announce VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -8 that that SCONJ IN _ 11 mark 11:mark _ -9 bombings bombing NOUN NNS Number=Plur 11 nsubj 11:nsubj _ -10 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 11 aux 11:aux _ -11 declined decline VERB VBN Tense=Past|VerbForm=Part 7 ccomp 7:ccomp _ -12 80 80 NUM CD NumType=Card 13 nummod 13:nummod _ -13 percent percent NOUN NN Number=Sing 11 obj 11:obj _ -14 in in ADP IN _ 15 case 15:case _ -15 Mosul Mosul PROPN NNP Number=Sing 11 obl 11:obl:in SpaceAfter=No -16 , , PUNCT , _ 11 punct 11:punct _ -17 whereas whereas SCONJ IN _ 20 mark 20:mark _ -18 there there PRON EX _ 20 expl 20:expl _ -19 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 20 aux 20:aux _ -20 been be VERB VBN Tense=Past|VerbForm=Part 11 advcl 11:advcl:whereas _ -21 a a DET DT Definite=Ind|PronType=Art 23 det 23:det _ -22 big big ADJ JJ Degree=Pos 23 amod 23:amod _ -23 jump jump NOUN NN Number=Sing 20 nsubj 20:nsubj _ -24 in in ADP IN _ 26 case 26:case _ -25 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -26 number number NOUN NN Number=Sing 23 nmod 23:nmod:in _ -27 of of ADP IN _ 28 case 28:case _ -28 kidnappings kidnapping NOUN NNS Number=Plur 26 nmod 26:nmod:of SpaceAfter=No -29 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0013 -# text = On Wednesday guerrillas had kidnapped a cosmetic surgeon and his wife while they were on their way home. -1 On on ADP IN _ 2 case 2:case _ -2 Wednesday Wednesday PROPN NNP Number=Sing 5 obl 5:obl:on _ -3 guerrillas guerrilla NOUN NNS Number=Plur 5 nsubj 5:nsubj _ -4 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 5 aux 5:aux _ -5 kidnapped kidnap VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -6 a a DET DT Definite=Ind|PronType=Art 8 det 8:det _ -7 cosmetic cosmetic ADJ JJ Degree=Pos 8 amod 8:amod _ -8 surgeon surgeon NOUN NN Number=Sing 5 obj 5:obj _ -9 and and CCONJ CC _ 11 cc 11:cc _ -10 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 11 nmod:poss 11:nmod:poss _ -11 wife wife NOUN NN Number=Sing 8 conj 5:obj|8:conj:and _ -12 while while SCONJ IN _ 17 mark 17:mark _ -13 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 17 nsubj 17:nsubj _ -14 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 17 cop 17:cop _ -15 on on ADP IN _ 17 case 17:case _ -16 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 17 nmod:poss 17:nmod:poss _ -17 way way NOUN NN Number=Sing 5 advcl 5:advcl:on _ -18 home home ADV RB _ 17 advmod 17:advmod SpaceAfter=No -19 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0014 -# text = In Suwayrah, Kut Province, two car bombs were discovered before they could be detonated. 
-1 In in ADP IN _ 2 case 2:case _ -2 Suwayrah Suwayrah PROPN NNP Number=Sing 11 obl 11:obl:in SpaceAfter=No -3 , , PUNCT , _ 2 punct 2:punct _ -4 Kut Kut PROPN NNP Number=Sing 5 compound 5:compound _ -5 Province Province PROPN NNP Number=Sing 2 appos 2:appos SpaceAfter=No -6 , , PUNCT , _ 11 punct 11:punct _ -7 two two NUM CD NumType=Card 9 nummod 9:nummod _ -8 car car NOUN NN Number=Sing 9 compound 9:compound _ -9 bombs bombs NOUN NN Number=Sing 11 nsubj:pass 11:nsubj:pass _ -10 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 11 aux:pass 11:aux:pass _ -11 discovered discover VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -12 before before SCONJ IN _ 16 mark 16:mark _ -13 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 16 nsubj:pass 16:nsubj:pass _ -14 could could AUX MD VerbForm=Fin 16 aux 16:aux _ -15 be be AUX VB VerbForm=Inf 16 aux:pass 16:aux:pass _ -16 detonated detonate VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 11 advcl 11:advcl:before SpaceAfter=No -17 . . PUNCT . _ 11 punct 11:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0015 -# text = (Kut is in southeastern Iraq and has an overwhelmingly Shiite population, who are on the lookout for Baathist saboteurs and willingly turn them in. -1 ( ( PUNCT -LRB- _ 6 punct 6:punct SpaceAfter=No -2 Kut Kut PROPN NNP Number=Sing 6 nsubj 6:nsubj|8:nsubj _ -3 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ -4 in in ADP IN _ 6 case 6:case _ -5 southeastern southeastern ADJ JJ Degree=Pos 6 amod 6:amod _ -6 Iraq Iraq PROPN NNP Number=Sing 0 root 0:root _ -7 and and CCONJ CC _ 8 cc 8:cc _ -8 has have VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 conj 6:conj:and _ -9 an a DET DT Definite=Ind|PronType=Art 12 det 12:det _ -10 overwhelmingly overwhelmingly ADV RB _ 11 advmod 11:advmod _ -11 Shiite shiite ADJ JJ Degree=Pos 12 amod 12:amod _ -12 population population NOUN NN Number=Sing 8 obj 8:obj|18:nsubj|24:nsubj SpaceAfter=No -13 , , PUNCT , _ 12 punct 12:punct _ -14 who who PRON WP PronType=Rel 18 nsubj 12:ref _ -15 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 18 cop 18:cop _ -16 on on ADP IN _ 18 case 18:case _ -17 the the DET DT Definite=Def|PronType=Art 18 det 18:det _ -18 lookout lookout NOUN NN Number=Sing 12 acl:relcl 12:acl:relcl _ -19 for for ADP IN _ 21 case 21:case _ -20 Baathist Baathist PROPN NNP Number=Sing 21 compound 21:compound _ -21 saboteurs saboteur NOUN NNS Number=Plur 18 nmod 18:nmod:for _ -22 and and CCONJ CC _ 24 cc 24:cc _ -23 willingly willingly ADV RB _ 24 advmod 24:advmod _ -24 turn turn VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 18 conj 12:acl:relcl|18:conj:and _ -25 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 24 obj 24:obj _ -26 in in ADP RP _ 24 compound:prt 24:compound:prt SpaceAfter=No -27 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0016 -# text = This willingness is the main difference in the number of bombings in the south as opposed to the center-north of the country.) 
-1 This this DET DT Number=Sing|PronType=Dem 2 det 2:det _ -2 willingness willingness NOUN NN Number=Sing 6 nsubj 6:nsubj _ -3 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ -4 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -5 main main ADJ JJ Degree=Pos 6 amod 6:amod _ -6 difference difference NOUN NN Number=Sing 0 root 0:root _ -7 in in ADP IN _ 9 case 9:case _ -8 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -9 number number NOUN NN Number=Sing 6 nmod 6:nmod:in _ -10 of of ADP IN _ 11 case 11:case _ -11 bombings bombing NOUN NNS Number=Plur 9 nmod 9:nmod:of _ -12 in in ADP IN _ 14 case 14:case _ -13 the the DET DT Definite=Def|PronType=Art 14 det 14:det _ -14 south south NOUN NN Number=Sing 11 nmod 11:nmod:in _ -15 as as SCONJ IN _ 21 case 21:case _ -16 opposed oppose VERB VBN Tense=Past|VerbForm=Part 15 fixed 15:fixed _ -17 to to ADP IN _ 15 fixed 15:fixed _ -18 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ -19 center center NOUN NN Number=Sing 21 compound 21:compound SpaceAfter=No -20 - - PUNCT HYPH _ 21 punct 21:punct SpaceAfter=No -21 north north NOUN NN Number=Sing 14 nmod 14:nmod:as_oppose_to _ -22 of of ADP IN _ 24 case 24:case _ -23 the the DET DT Definite=Def|PronType=Art 24 det 24:det _ -24 country country NOUN NN Number=Sing 21 nmod 21:nmod:of SpaceAfter=No -25 . . PUNCT . _ 6 punct 6:punct SpaceAfter=No -26 ) ) PUNCT -RRB- _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0017 -# text = In Baghdad Kadhim Talal Husain, assistant dean at the School of Education at Mustansiriyah University, was assassinated with his driver in the Salikh district. -1 In in ADP IN _ 2 case 2:case _ -2 Baghdad Baghdad PROPN NNP Number=Sing 19 obl 19:obl:in _ -3 Kadhim Kadhim PROPN NNP Number=Sing 19 nsubj:pass 19:nsubj:pass _ -4 Talal Talal PROPN NNP Number=Sing 3 flat 3:flat _ -5 Husain Husain PROPN NNP Number=Sing 3 flat 3:flat SpaceAfter=No -6 , , PUNCT , _ 3 punct 3:punct _ -7 assistant assistant ADJ JJ Degree=Pos 8 amod 8:amod _ -8 dean dean NOUN NN Number=Sing 3 appos 3:appos _ -9 at at ADP IN _ 11 case 11:case _ -10 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -11 School School PROPN NNP Number=Sing 8 nmod 8:nmod:at _ -12 of of ADP IN _ 13 case 13:case _ -13 Education Education PROPN NNP Number=Sing 11 nmod 11:nmod:of _ -14 at at ADP IN _ 16 case 16:case _ -15 Mustansiriyah Mustansiriyah PROPN NNP Number=Sing 16 compound 16:compound _ -16 University University PROPN NNP Number=Sing 11 nmod 11:nmod:at SpaceAfter=No -17 , , PUNCT , _ 19 punct 19:punct _ -18 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 19 aux:pass 19:aux:pass _ -19 assassinated assassinate VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -20 with with ADP IN _ 22 case 22:case _ -21 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 22 nmod:poss 22:nmod:poss _ -22 driver driver NOUN NN Number=Sing 19 obl 19:obl:with _ -23 in in ADP IN _ 26 case 26:case _ -24 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -25 Salikh Salikh PROPN NNP Number=Sing 26 compound 26:compound _ -26 district district NOUN NN Number=Sing 19 obl 19:obl:in SpaceAfter=No -27 . . PUNCT . _ 19 punct 19:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0018 -# text = Guerrillas killed an engineer, Asi Ali, from Tikrit. 
-1 Guerrillas guerrilla NOUN NNS Number=Plur 2 nsubj 2:nsubj _ -2 killed kill VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -3 an a DET DT Definite=Ind|PronType=Art 4 det 4:det _ -4 engineer engineer NOUN NN Number=Sing 2 obj 2:obj SpaceAfter=No -5 , , PUNCT , _ 4 punct 4:punct _ -6 Asi Asi PROPN NNP Number=Sing 4 appos 4:appos _ -7 Ali Ali PROPN NNP Number=Sing 6 flat 6:flat SpaceAfter=No -8 , , PUNCT , _ 4 punct 4:punct _ -9 from from ADP IN _ 10 case 10:case _ -10 Tikrit Tikrit PROPN NNP Number=Sing 4 nmod 4:nmod:from SpaceAfter=No -11 . . PUNCT . _ 2 punct 2:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0019 -# text = They also killed Shaikh Hamid 'Akkab, a clan elder of a branch of the Dulaim tribe in Tikrit. -1 They they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 3 nsubj 3:nsubj _ -2 also also ADV RB _ 3 advmod 3:advmod _ -3 killed kill VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -4 Shaikh Shaikh PROPN NNP Number=Sing 3 obj 3:obj _ -5 Hamid Hamid PROPN NNP Number=Sing 4 flat 4:flat _ -6 'Akkab 'Akkab PROPN NNP Number=Sing 4 flat 4:flat SpaceAfter=No -7 , , PUNCT , _ 4 punct 4:punct _ -8 a a DET DT Definite=Ind|PronType=Art 10 det 10:det _ -9 clan clan NOUN NN Number=Sing 10 compound 10:compound _ -10 elder elder NOUN NN Number=Sing 4 appos 4:appos _ -11 of of ADP IN _ 13 case 13:case _ -12 a a DET DT Definite=Ind|PronType=Art 13 det 13:det _ -13 branch branch NOUN NN Number=Sing 10 nmod 10:nmod:of _ -14 of of ADP IN _ 17 case 17:case _ -15 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ -16 Dulaim Dulaim PROPN NNP Number=Sing 17 compound 17:compound _ -17 tribe tribe NOUN NN Number=Sing 13 nmod 13:nmod:of _ -18 in in ADP IN _ 19 case 19:case _ -19 Tikrit Tikrit PROPN NNP Number=Sing 3 obl 3:obl:in SpaceAfter=No -20 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0020 -# text = His mother was also killed in the attack. -1 His he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 2 nmod:poss 2:nmod:poss _ -2 mother mother NOUN NN Number=Sing 5 nsubj:pass 5:nsubj:pass _ -3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 5 aux:pass 5:aux:pass _ -4 also also ADV RB _ 5 advmod 5:advmod _ -5 killed kill VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -6 in in ADP IN _ 8 case 8:case _ -7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -8 attack attack NOUN NN Number=Sing 5 obl 5:obl:in SpaceAfter=No -9 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0021 -# text = Two other Dulaim leaders have been killed in the past week and a half. -1 Two two NUM CD NumType=Card 4 nummod 4:nummod _ -2 other other ADJ JJ Degree=Pos 4 amod 4:amod _ -3 Dulaim Dulaim PROPN NNP Number=Sing 4 compound 4:compound _ -4 leaders leader NOUN NNS Number=Plur 7 nsubj:pass 7:nsubj:pass _ -5 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 7 aux 7:aux _ -6 been be AUX VBN Tense=Past|VerbForm=Part 7 aux:pass 7:aux:pass _ -7 killed kill VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -8 in in ADP IN _ 11 case 11:case _ -9 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -10 past past ADJ JJ Degree=Pos 11 amod 11:amod _ -11 week week NOUN NN Number=Sing 7 obl 7:obl:in _ -12 and and CCONJ CC _ 14 cc 14:cc _ -13 a a DET DT Definite=Ind|PronType=Art 14 det 14:det _ -14 half half NOUN NN Number=Sing 11 nummod 11:nummod SpaceAfter=No -15 . . 
PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0022 -# text = Guerrillas near Hawijah launched an attack that left 6 dead, including 4 Iraqi soldiers. -1 Guerrillas guerrilla NOUN NNS Number=Plur 4 nsubj 4:nsubj _ -2 near near ADP IN _ 3 case 3:case _ -3 Hawijah Hawijah PROPN NNP Number=Sing 1 nmod 1:nmod:near _ -4 launched launch VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -5 an a DET DT Definite=Ind|PronType=Art 6 det 6:det _ -6 attack attack NOUN NN Number=Sing 4 obj 4:obj|8:nsubj _ -7 that that PRON WDT PronType=Rel 8 nsubj 6:ref _ -8 left leave VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 6 acl:relcl 6:acl:relcl _ -9 6 6 NUM CD NumType=Card 8 obj 8:obj|10:nsubj:xsubj _ -10 dead dead ADJ JJ Degree=Pos 8 xcomp 8:xcomp SpaceAfter=No -11 , , PUNCT , _ 8 punct 8:punct _ -12 including include VERB VBG VerbForm=Ger 15 case 15:case _ -13 4 4 NUM CD NumType=Card 15 nummod 15:nummod _ -14 Iraqi iraqi ADJ JJ Degree=Pos 15 amod 15:amod _ -15 soldiers soldier NOUN NNS Number=Plur 9 nmod 9:nmod:include SpaceAfter=No -16 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0023 -# text = One of them was from the Jubur tribe and was deputy commander of the Hawijah garrison. -1 One one NUM CD NumType=Card 7 nsubj 7:nsubj|12:nsubj _ -2 of of ADP IN _ 3 case 3:case _ -3 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 1 nmod 1:nmod:of _ -4 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 7 cop 7:cop _ -5 from from ADP IN _ 7 case 7:case _ -6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -7 Jubur Jubur PROPN NNP Number=Sing 0 root 0:root _ -8 tribe tribe NOUN NN Number=Sing 7 flat 7:flat _ -9 and and CCONJ CC _ 12 cc 12:cc _ -10 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 12 cop 12:cop _ -11 deputy deputy NOUN NN Number=Sing 12 compound 12:compound _ -12 commander commander NOUN NN Number=Sing 7 conj 7:conj:and _ -13 of of ADP IN _ 16 case 16:case _ -14 the the DET DT Definite=Def|PronType=Art 16 det 16:det _ -15 Hawijah Hawijah PROPN NNP Number=Sing 16 compound 16:compound _ -16 garrison garrison NOUN NN Number=Sing 12 nmod 12:nmod:of SpaceAfter=No -17 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0024 -# text = Two hundred members of the Batawi clan of the Dulaim demonstrated in Baghdad on Friday, protesting the killing of their clan elder, Shaikh Kadhim Sarhid and 4 of his sons, by gunmen wearing Iraqi army uniforms. 
-1 Two two NUM CD NumType=Card 2 compound 2:compound _ -2 hundred hundred NUM CD NumType=Card 3 nummod 3:nummod _ -3 members member NOUN NNS Number=Plur 11 nsubj 11:nsubj _ -4 of of ADP IN _ 6 case 6:case _ -5 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -6 Batawi Batawi PROPN NNP Number=Sing 3 nmod 3:nmod:of _ -7 clan clan NOUN NN Number=Sing 6 flat 6:flat _ -8 of of ADP IN _ 10 case 10:case _ -9 the the DET DT Definite=Def|PronType=Art 10 det 10:det _ -10 Dulaim Dulaim PROPN NNP Number=Sing 6 nmod 6:nmod:of _ -11 demonstrated demonstrate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -12 in in ADP IN _ 13 case 13:case _ -13 Baghdad Baghdad PROPN NNP Number=Sing 11 obl 11:obl:in _ -14 on on ADP IN _ 15 case 15:case _ -15 Friday Friday PROPN NNP Number=Sing 11 obl 11:obl:on SpaceAfter=No -16 , , PUNCT , _ 11 punct 11:punct _ -17 protesting protest VERB VBG VerbForm=Ger 11 advcl 11:advcl _ -18 the the DET DT Definite=Def|PronType=Art 19 det 19:det _ -19 killing killing NOUN NN Number=Sing 17 obj 17:obj _ -20 of of ADP IN _ 23 case 23:case _ -21 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 23 nmod:poss 23:nmod:poss _ -22 clan clan NOUN NN Number=Sing 23 compound 23:compound _ -23 elder elder NOUN NN Number=Sing 19 nmod 19:nmod:of SpaceAfter=No -24 , , PUNCT , _ 23 punct 23:punct _ -25 Shaikh Shaikh PROPN NNP Number=Sing 23 appos 23:appos _ -26 Kadhim Kadhim PROPN NNP Number=Sing 25 flat 25:flat _ -27 Sarhid Sarhid PROPN NNP Number=Sing 25 flat 25:flat _ -28 and and CCONJ CC _ 29 cc 29:cc _ -29 4 4 NUM CD NumType=Card 23 conj 19:nmod:of|23:conj:and _ -30 of of ADP IN _ 32 case 32:case _ -31 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 32 nmod:poss 32:nmod:poss _ -32 sons son NOUN NNS Number=Plur 29 nmod 29:nmod:of SpaceAfter=No -33 , , PUNCT , _ 19 punct 19:punct _ -34 by by ADP IN _ 35 case 35:case _ -35 gunmen gunman NOUN NNS Number=Plur 19 nmod 19:nmod:by _ -36 wearing wear VERB VBG VerbForm=Ger 35 acl 35:acl _ -37 Iraqi iraqi ADJ JJ Degree=Pos 39 amod 39:amod _ -38 army army NOUN NN Number=Sing 39 compound 39:compound _ -39 uniforms uniform NOUN NNS Number=Plur 36 obj 36:obj SpaceAfter=No -40 . . PUNCT . _ 11 punct 11:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0025 -# text = (This is a largely Sunni Arab clan, and some Sunni observers have accused Shiite elements in the government of being behind the assassination; it is more likely the work of Sunni Arab guerrillas punishing the Batawi leaders for cooperating with the Dec. 15 elections.) 
-1 ( ( PUNCT -LRB- _ 8 punct 8:punct SpaceAfter=No -2 This this PRON DT Number=Sing|PronType=Dem 8 nsubj 8:nsubj _ -3 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 cop 8:cop _ -4 a a DET DT Definite=Ind|PronType=Art 8 det 8:det _ -5 largely largely ADV RB _ 8 advmod 8:advmod _ -6 Sunni sunni ADJ JJ Degree=Pos 8 amod 8:amod _ -7 Arab arab ADJ JJ Degree=Pos 8 amod 8:amod _ -8 clan clan NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -9 , , PUNCT , _ 15 punct 15:punct _ -10 and and CCONJ CC _ 15 cc 15:cc _ -11 some some DET DT _ 13 det 13:det _ -12 Sunni sunni ADJ JJ Degree=Pos 13 amod 13:amod _ -13 observers observer NOUN NNS Number=Plur 15 nsubj 15:nsubj _ -14 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 15 aux 15:aux _ -15 accused accuse VERB VBN Tense=Past|VerbForm=Part 8 conj 8:conj:and _ -16 Shiite shiite ADJ JJ Degree=Pos 17 amod 17:amod _ -17 elements element NOUN NNS Number=Plur 15 obj 15:obj _ -18 in in ADP IN _ 20 case 20:case _ -19 the the DET DT Definite=Def|PronType=Art 20 det 20:det _ -20 government government NOUN NN Number=Sing 17 nmod 17:nmod:in _ -21 of of SCONJ IN _ 25 mark 25:mark _ -22 being be AUX VBG VerbForm=Ger 25 cop 25:cop _ -23 behind behind ADP IN _ 25 case 25:case _ -24 the the DET DT Definite=Def|PronType=Art 25 det 25:det _ -25 assassination assassination NOUN NN Number=Sing 15 advcl 15:advcl:behind SpaceAfter=No -26 ; ; PUNCT , _ 8 punct 8:punct _ -27 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 32 nsubj 32:nsubj _ -28 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 32 cop 32:cop _ -29 more more ADV RBR _ 30 advmod 30:advmod _ -30 likely likely ADV RB _ 32 advmod 32:advmod _ -31 the the DET DT Definite=Def|PronType=Art 32 det 32:det _ -32 work work NOUN NN Number=Sing 8 parataxis 8:parataxis _ -33 of of ADP IN _ 36 case 36:case _ -34 Sunni sunni ADJ JJ Degree=Pos 36 amod 36:amod _ -35 Arab arab ADJ JJ Degree=Pos 36 amod 36:amod _ -36 guerrillas guerrilla NOUN NNS Number=Plur 32 nmod 32:nmod:of _ -37 punishing punish VERB VBG VerbForm=Ger 36 acl 36:acl _ -38 the the DET DT Definite=Def|PronType=Art 39 det 39:det _ -39 Batawi Batawi PROPN NNP Number=Sing 37 obj 37:obj _ -40 leaders leader NOUN NNS Number=Plur 39 flat 39:flat _ -41 for for SCONJ IN _ 42 mark 42:mark _ -42 cooperating cooperate VERB VBG VerbForm=Ger 39 acl 39:acl:for _ -43 with with ADP IN _ 47 case 47:case _ -44 the the DET DT Definite=Def|PronType=Art 47 det 47:det _ -45 Dec. Dec. PROPN NNP Number=Sing 47 compound 47:compound _ -46 15 15 NUM CD NumType=Card 45 nummod 45:nummod _ -47 elections election NOUN NNS Number=Plur 42 obl 42:obl:with SpaceAfter=No -48 . . PUNCT . _ 8 punct 8:punct SpaceAfter=No -49 ) ) PUNCT -RRB- _ 8 punct 8:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0026 -# text = Al-Zaman : The Iraqi High Electoral Commission on Friday denied a request of the Debaathification Commission to exclude 51 individuals from running on party lists in the Dec. 15 elections on grounds of having been sufficiently involved in Baath activities to warrant their being excluded from civil office. 
-1 Al Al PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No -2 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No -3 Zaman Zaman PROPN NNP Number=Sing 1 flat 1:flat _ -4 : : PUNCT : _ 1 punct 1:punct _ -5 The the DET DT Definite=Def|PronType=Art 9 det 9:det _ -6 Iraqi iraqi ADJ JJ Degree=Pos 9 amod 9:amod _ -7 High High PROPN NNP Number=Sing 9 compound 9:compound _ -8 Electoral Electoral PROPN NNP Number=Sing 9 compound 9:compound _ -9 Commission Commission PROPN NNP Number=Sing 12 nsubj 12:nsubj _ -10 on on ADP IN _ 11 case 11:case _ -11 Friday Friday PROPN NNP Number=Sing 12 obl 12:obl:on _ -12 denied deny VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ -13 a a DET DT Definite=Ind|PronType=Art 14 det 14:det _ -14 request request NOUN NN Number=Sing 12 obj 12:obj _ -15 of of ADP IN _ 18 case 18:case _ -16 the the DET DT Definite=Def|PronType=Art 18 det 18:det _ -17 Debaathification Debaathification PROPN NNP Number=Sing 18 compound 18:compound _ -18 Commission Commission PROPN NNP Number=Sing 14 nmod 14:nmod:of _ -19 to to PART TO _ 20 mark 20:mark _ -20 exclude exclude VERB VB VerbForm=Inf 14 acl 14:acl:to _ -21 51 51 NUM CD NumType=Card 22 nummod 22:nummod _ -22 individuals individual NOUN NNS Number=Plur 20 obj 20:obj _ -23 from from SCONJ IN _ 24 mark 24:mark _ -24 running run VERB VBG VerbForm=Ger 20 advcl 20:advcl:from _ -25 on on ADP IN _ 27 case 27:case _ -26 party party NOUN NN Number=Sing 27 compound 27:compound _ -27 lists list NOUN NNS Number=Plur 24 obl 24:obl:on _ -28 in in ADP IN _ 32 case 32:case _ -29 the the DET DT Definite=Def|PronType=Art 32 det 32:det _ -30 Dec. Dec. PROPN NNP Number=Sing 32 compound 32:compound _ -31 15 15 NUM CD NumType=Card 30 nummod 30:nummod _ -32 elections election NOUN NNS Number=Plur 27 nmod 27:nmod:in _ -33 on on ADP IN _ 34 case 34:case _ -34 grounds grounds NOUN NNS Number=Plur 24 obl 24:obl:on _ -35 of of SCONJ IN _ 39 mark 39:mark _ -36 having have AUX VBG VerbForm=Ger 39 aux 39:aux _ -37 been be AUX VBN Tense=Past|VerbForm=Part 39 cop 39:cop _ -38 sufficiently sufficiently ADV RB _ 39 advmod 39:advmod _ -39 involved involved ADJ JJ Degree=Pos 34 acl 34:acl:of _ -40 in in ADP IN _ 42 case 42:case _ -41 Baath Baath PROPN NNP Number=Sing 42 compound 42:compound _ -42 activities activity NOUN NNS Number=Plur 39 obl 39:obl:in _ -43 to to PART TO _ 44 mark 44:mark _ -44 warrant warrant VERB VB VerbForm=Inf 38 xcomp 38:xcomp _ -45 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 47 nsubj:pass 47:nsubj:pass _ -46 being be AUX VBG VerbForm=Ger 47 aux:pass 47:aux:pass _ -47 excluded exclude VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 44 ccomp 44:ccomp _ -48 from from ADP IN _ 50 case 50:case _ -49 civil civil ADJ JJ Degree=Pos 50 amod 50:amod _ -50 office office NOUN NN Number=Sing 47 obl 47:obl:from SpaceAfter=No -51 . . PUNCT . _ 1 punct 1:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0027 -# text = The Commission said it had no legal grounds for such an exclusion. 
-1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 Commission commission NOUN NN Number=Sing 3 nsubj 3:nsubj _ -3 said say VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -4 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 5 nsubj 5:nsubj _ -5 had have VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 3 ccomp 3:ccomp _ -6 no no DET DT _ 8 det 8:det _ -7 legal legal ADJ JJ Degree=Pos 8 amod 8:amod _ -8 grounds grounds NOUN NNS Number=Plur 5 obj 5:obj _ -9 for for ADP IN _ 12 case 12:case _ -10 such such DET PDT _ 12 det:predet 12:det:predet _ -11 an a DET DT Definite=Ind|PronType=Art 12 det 12:det _ -12 exclusion exclusion NOUN NN Number=Sing 8 nmod 8:nmod:for SpaceAfter=No -13 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0028 -# text = This item is a small one and easily missed. -1 This this DET DT Number=Sing|PronType=Dem 2 det 2:det _ -2 item item NOUN NN Number=Sing 6 nsubj 6:nsubj|9:nsubj _ -3 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ -4 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ -5 small small ADJ JJ Degree=Pos 6 amod 6:amod _ -6 one one NOUN NN Number=Sing 0 root 0:root _ -7 and and CCONJ CC _ 9 cc 9:cc _ -8 easily easily ADV RB _ 9 advmod 9:advmod _ -9 missed miss VERB VBN Tense=Past|VerbForm=Part 6 conj 6:conj:and SpaceAfter=No -10 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0029 -# text = But in my view it is highly significant. -1 But but CCONJ CC _ 8 cc 8:cc _ -2 in in ADP IN _ 4 case 4:case _ -3 my my PRON PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs 4 nmod:poss 4:nmod:poss _ -4 view view NOUN NN Number=Sing 8 obl 8:obl:in _ -5 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 8 nsubj 8:nsubj _ -6 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 cop 8:cop _ -7 highly highly ADV RB _ 8 advmod 8:advmod _ -8 significant significant ADJ JJ Degree=Pos 0 root 0:root SpaceAfter=No -9 . . PUNCT . _ 8 punct 8:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0030 -# text = The Debaathification Commission had been pushed by Ahmad Chalabi and his Iraqi National Congress very hard, and had pushed many Sunni Arabs into the arms of the guerrillas. 
-1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _ -2 Debaathification Debaathification PROPN NNP Number=Sing 3 compound 3:compound _ -3 Commission Commission PROPN NNP Number=Sing 6 nsubj:pass 6:nsubj:pass|20:nsubj:pass _ -4 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 6 aux 6:aux _ -5 been be AUX VBN Tense=Past|VerbForm=Part 6 aux:pass 6:aux:pass _ -6 pushed push VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -7 by by ADP IN _ 8 case 8:case _ -8 Ahmad Ahmad PROPN NNP Number=Sing 6 obl 6:obl:by _ -9 Chalabi Chalabi PROPN NNP Number=Sing 8 flat 8:flat _ -10 and and CCONJ CC _ 14 cc 14:cc _ -11 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 14 nmod:poss 14:nmod:poss _ -12 Iraqi Iraqi PROPN NNP Number=Sing 14 compound 14:compound _ -13 National National PROPN NNP Number=Sing 14 compound 14:compound _ -14 Congress Congress PROPN NNP Number=Sing 8 conj 6:obl:by|8:conj:and _ -15 very very ADV RB _ 16 advmod 16:advmod _ -16 hard hard ADV RB Degree=Pos 6 advmod 6:advmod SpaceAfter=No -17 , , PUNCT , _ 20 punct 20:punct _ -18 and and CCONJ CC _ 20 cc 20:cc _ -19 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 20 aux 20:aux _ -20 pushed push VERB VBN Tense=Past|VerbForm=Part 6 conj 6:conj:and _ -21 many many ADJ JJ Degree=Pos 23 amod 23:amod _ -22 Sunni sunni ADJ JJ Degree=Pos 23 amod 23:amod _ -23 Arabs Arabs PROPN NNPS Number=Plur 20 obj 20:obj _ -24 into into ADP IN _ 26 case 26:case _ -25 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -26 arms arm NOUN NNS Number=Plur 20 obl 20:obl:into _ -27 of of ADP IN _ 29 case 29:case _ -28 the the DET DT Definite=Def|PronType=Art 29 det 29:det _ -29 guerrillas guerrilla NOUN NNS Number=Plur 26 nmod 26:nmod:of SpaceAfter=No -30 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0031 -# text = Chalabi has been increasingly marginalized within Iraq, however, despite his ties of clientelage with Washington and Tehran. -1 Chalabi Chalabi PROPN NNP Number=Sing 5 nsubj:pass 5:nsubj:pass _ -2 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 aux 5:aux _ -3 been be AUX VBN Tense=Past|VerbForm=Part 5 aux:pass 5:aux:pass _ -4 increasingly increasingly ADV RB _ 5 advmod 5:advmod _ -5 marginalized marginalize VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -6 within within ADP IN _ 7 case 7:case _ -7 Iraq Iraq PROPN NNP Number=Sing 5 obl 5:obl:within SpaceAfter=No -8 , , PUNCT , _ 5 punct 5:punct _ -9 however however ADV RB _ 5 advmod 5:advmod SpaceAfter=No -10 , , PUNCT , _ 5 punct 5:punct _ -11 despite despite ADP IN _ 13 case 13:case _ -12 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 13 nmod:poss 13:nmod:poss _ -13 ties tie NOUN NNS Number=Plur 5 obl 5:obl:despite _ -14 of of ADP IN _ 15 case 15:case _ -15 clientelage clientelage NOUN NN Number=Sing 13 nmod 13:nmod:of _ -16 with with ADP IN _ 17 case 17:case _ -17 Washington Washington PROPN NNP Number=Sing 15 nmod 15:nmod:with _ -18 and and CCONJ CC _ 19 cc 19:cc _ -19 Tehran Tehran PROPN NNP Number=Sing 17 conj 15:nmod:with|17:conj:and SpaceAfter=No -20 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0032 -# text = He is no longer in the dominant Shiite list, the United Iraqi Alliance, and won't have many seats in the new parliament. 
-1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 9 nsubj 9:nsubj|19:nsubj _ -2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 9 cop 9:cop _ -3 no no ADV RB _ 4 advmod 4:advmod _ -4 longer longer ADV RBR Degree=Cmp 7 advmod 7:advmod _ -5 in in ADP IN _ 9 case 9:case _ -6 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -7 dominant dominant ADJ JJ Degree=Pos 9 amod 9:amod _ -8 Shiite shiite ADJ JJ Degree=Pos 9 amod 9:amod _ -9 list list NOUN NN Number=Sing 0 root 0:root SpaceAfter=No -10 , , PUNCT , _ 9 punct 9:punct _ -11 the the DET DT Definite=Def|PronType=Art 14 det 14:det _ -12 United United PROPN NNP Number=Sing 14 compound 14:compound _ -13 Iraqi Iraqi PROPN NNP Number=Sing 14 compound 14:compound _ -14 Alliance Alliance PROPN NNP Number=Sing 9 appos 9:appos SpaceAfter=No -15 , , PUNCT , _ 19 punct 19:punct _ -16 and and CCONJ CC _ 19 cc 19:cc _ -17 wo will AUX MD VerbForm=Fin 19 aux 19:aux SpaceAfter=No -18 n't not PART RB _ 19 advmod 19:advmod _ -19 have have VERB VB VerbForm=Inf 9 conj 9:conj:and _ -20 many many ADJ JJ Degree=Pos 21 amod 21:amod _ -21 seats seat NOUN NNS Number=Plur 19 obj 19:obj _ -22 in in ADP IN _ 25 case 25:case _ -23 the the DET DT Definite=Def|PronType=Art 25 det 25:det _ -24 new new ADJ JJ Degree=Pos 25 amod 25:amod _ -25 parliament parliament NOUN NN Number=Sing 21 nmod 21:nmod:in SpaceAfter=No -26 . . PUNCT . _ 9 punct 9:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0033 -# text = Some 2,000 junior officers of the old Baath army have been recalled to duty in recent months, something Chalabi would have blocked if he could have. -1 Some some DET DT _ 4 det 4:det _ -2 2,000 2,000 NUM CD NumType=Card 4 nummod 4:nummod _ -3 junior junior ADJ JJ Degree=Pos 4 amod 4:amod _ -4 officers officer NOUN NNS Number=Plur 12 nsubj:pass 12:nsubj:pass _ -5 of of ADP IN _ 9 case 9:case _ -6 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -7 old old ADJ JJ Degree=Pos 9 amod 9:amod _ -8 Baath Baath PROPN NNP Number=Sing 9 compound 9:compound _ -9 army army NOUN NN Number=Sing 4 nmod 4:nmod:of _ -10 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 12 aux 12:aux _ -11 been be AUX VBN Tense=Past|VerbForm=Part 12 aux:pass 12:aux:pass _ -12 recalled recall VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -13 to to ADP IN _ 14 case 14:case _ -14 duty duty NOUN NN Number=Sing 12 obl 12:obl:to _ -15 in in ADP IN _ 17 case 17:case _ -16 recent recent ADJ JJ Degree=Pos 17 amod 17:amod _ -17 months month NOUN NNS Number=Plur 12 obl 12:obl:in SpaceAfter=No -18 , , PUNCT , _ 12 punct 12:punct _ -19 something something PRON NN Number=Sing 12 obl:npmod 12:obl:npmod _ -20 Chalabi Chalabi PROPN NNP Number=Sing 23 nsubj 23:nsubj _ -21 would would AUX MD VerbForm=Fin 23 aux 23:aux _ -22 have have AUX VB VerbForm=Inf 23 aux 23:aux _ -23 blocked block VERB VBN Tense=Past|VerbForm=Part 19 acl:relcl 19:acl:relcl _ -24 if if SCONJ IN _ 27 mark 27:mark _ -25 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 27 nsubj 27:nsubj _ -26 could could AUX MD VerbForm=Fin 27 aux 27:aux _ -27 have have VERB VB VerbForm=Inf 23 advcl 23:advcl:if SpaceAfter=No -28 . . PUNCT . _ 12 punct 12:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0034 -# text = Now the Electoral Commission is refusing to punish people for mere past Baath Party membership. 
-1 Now now ADV RB _ 6 advmod 6:advmod _ -2 the the DET DT Definite=Def|PronType=Art 4 det 4:det _ -3 Electoral Electoral PROPN NNP Number=Sing 4 compound 4:compound _ -4 Commission Commission PROPN NNP Number=Sing 6 nsubj 6:nsubj|8:nsubj:xsubj _ -5 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 aux 6:aux _ -6 refusing refuse VERB VBG Tense=Pres|VerbForm=Part 0 root 0:root _ -7 to to PART TO _ 8 mark 8:mark _ -8 punish punish VERB VB VerbForm=Inf 6 xcomp 6:xcomp _ -9 people people NOUN NNS Number=Plur 8 obj 8:obj _ -10 for for ADP IN _ 15 case 15:case _ -11 mere mere ADJ JJ Degree=Pos 15 amod 15:amod _ -12 past past ADJ JJ Degree=Pos 15 amod 15:amod _ -13 Baath Baath PROPN NNP Number=Sing 14 compound 14:compound _ -14 Party Party PROPN NNP Number=Sing 15 compound 15:compound _ -15 membership membership NOUN NN Number=Sing 8 obl 8:obl:for SpaceAfter=No -16 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0035 -# text = The situation in Iraq is only going to get better this way. -1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 situation situation NOUN NN Number=Sing 7 nsubj 7:nsubj|9:nsubj:xsubj|10:nsubj:xsubj _ -3 in in ADP IN _ 4 case 4:case _ -4 Iraq Iraq PROPN NNP Number=Sing 2 nmod 2:nmod:in _ -5 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 7 aux 7:aux _ -6 only only ADV RB _ 7 advmod 7:advmod _ -7 going go VERB VBG Tense=Pres|VerbForm=Part 0 root 0:root _ -8 to to PART TO _ 9 mark 9:mark _ -9 get get VERB VB VerbForm=Inf 7 xcomp 7:xcomp _ -10 better better ADJ JJR Degree=Cmp 9 xcomp 9:xcomp _ -11 this this DET DT Number=Sing|PronType=Dem 12 det 12:det _ -12 way way NOUN NN Number=Sing 10 obj 10:obj SpaceAfter=No -13 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0036 -# text = If someone committed a crime against humanity, prosecute the person. -1 If if SCONJ IN _ 3 mark 3:mark _ -2 someone someone PRON NN Number=Sing 3 nsubj 3:nsubj _ -3 committed commit VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 9 advcl 9:advcl:if _ -4 a a DET DT Definite=Ind|PronType=Art 5 det 5:det _ -5 crime crime NOUN NN Number=Sing 3 obj 3:obj _ -6 against against ADP IN _ 7 case 7:case _ -7 humanity humanity NOUN NN Number=Sing 5 nmod 5:nmod:against SpaceAfter=No -8 , , PUNCT , _ 9 punct 9:punct _ -9 prosecute prosecute VERB VB VerbForm=Inf 0 root 0:root _ -10 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -11 person person NOUN NN Number=Sing 9 obj 9:obj SpaceAfter=No -12 . . PUNCT . _ 9 punct 9:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0037 -# text = If he or she did not, then they should have all the same rights as other Iraqis. 
-1 If if SCONJ IN _ 5 mark 5:mark _ -2 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 5 nsubj 5:nsubj _ -3 or or CCONJ CC _ 4 cc 4:cc _ -4 she she PRON PRP Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs 2 conj 2:conj:or|5:nsubj _ -5 did do VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 11 advcl 11:advcl:if _ -6 not not PART RB _ 5 advmod 5:advmod SpaceAfter=No -7 , , PUNCT , _ 11 punct 11:punct _ -8 then then ADV RB PronType=Dem 11 advmod 11:advmod _ -9 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 11 nsubj 11:nsubj _ -10 should should AUX MD VerbForm=Fin 11 aux 11:aux _ -11 have have VERB VB VerbForm=Inf 0 root 0:root _ -12 all all DET PDT _ 15 det:predet 15:det:predet _ -13 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ -14 same same ADJ JJ Degree=Pos 15 amod 15:amod _ -15 rights rights NOUN NNS Number=Plur 11 obj 11:obj _ -16 as as ADP IN _ 18 case 18:case _ -17 other other ADJ JJ Degree=Pos 18 amod 18:amod _ -18 Iraqis Iraqis PROPN NNPS Number=Plur 15 nmod 15:nmod:as SpaceAfter=No -19 . . PUNCT . _ 11 punct 11:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0038 -# text = Al-Sharq al-Awsat reports that a key eyewitness in the trial of Saddam Hussein for a 1982 massacre at Dujail has died. -1 Al Al PROPN NNP Number=Sing 7 nsubj 7:nsubj SpaceAfter=No -2 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No -3 Sharq Sharq PROPN NNP Number=Sing 1 flat 1:flat _ -4 al al PROPN NNP Number=Sing 1 flat 1:flat SpaceAfter=No -5 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No -6 Awsat Awsat PROPN NNP Number=Sing 1 flat 1:flat _ -7 reports report VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -8 that that SCONJ IN _ 25 mark 25:mark _ -9 a a DET DT Definite=Ind|PronType=Art 11 det 11:det _ -10 key key ADJ JJ Degree=Pos 11 amod 11:amod _ -11 eyewitness eyewitness NOUN NN Number=Sing 25 nsubj 25:nsubj _ -12 in in ADP IN _ 14 case 14:case _ -13 the the DET DT Definite=Def|PronType=Art 14 det 14:det _ -14 trial trial NOUN NN Number=Sing 11 nmod 11:nmod:in _ -15 of of ADP IN _ 16 case 16:case _ -16 Saddam Saddam PROPN NNP Number=Sing 14 nmod 14:nmod:of _ -17 Hussein Hussein PROPN NNP Number=Sing 16 flat 16:flat _ -18 for for ADP IN _ 21 case 21:case _ -19 a a DET DT Definite=Ind|PronType=Art 21 det 21:det _ -20 1982 1982 NUM CD NumType=Card 21 nummod 21:nummod _ -21 massacre massacre NOUN NN Number=Sing 14 nmod 14:nmod:for _ -22 at at ADP IN _ 23 case 23:case _ -23 Dujail Dujail PROPN NNP Number=Sing 21 nmod 21:nmod:at _ -24 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 25 aux 25:aux _ -25 died die VERB VBN Tense=Past|VerbForm=Part 7 ccomp 7:ccomp SpaceAfter=No -26 . . PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0039 -# text = A team from the court managed to take his deposition before he died. 
-1 A a DET DT Definite=Ind|PronType=Art 2 det 2:det _ -2 team team NOUN NN Number=Sing 6 nsubj 6:nsubj|8:nsubj:xsubj _ -3 from from ADP IN _ 5 case 5:case _ -4 the the DET DT Definite=Def|PronType=Art 5 det 5:det _ -5 court court NOUN NN Number=Sing 2 nmod 2:nmod:from _ -6 managed manage VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -7 to to PART TO _ 8 mark 8:mark _ -8 take take VERB VB VerbForm=Inf 6 xcomp 6:xcomp _ -9 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 10 nmod:poss 10:nmod:poss _ -10 deposition deposition NOUN NN Number=Sing 8 obj 8:obj _ -11 before before SCONJ IN _ 13 mark 13:mark _ -12 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 13 nsubj 13:nsubj _ -13 died die VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 8 advcl 8:advcl:before SpaceAfter=No -14 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0040 -# text = The trial begins again Nov.28. -1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ -2 trial trial NOUN NN Number=Sing 3 nsubj 3:nsubj _ -3 begins begin VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -4 again again ADV RB _ 3 advmod 3:advmod _ -5 Nov. Nov. PROPN NNP Number=Sing 3 obl:tmod 3:obl:tmod SpaceAfter=No -6 28 28 NUM CD NumType=Card 5 nummod 5:nummod SpaceAfter=No -7 . . PUNCT . _ 3 punct 3:punct _ - -# newdoc id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012 -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0001 -# text = In Baghdad the fighting still continues in several areas, mostly in Sadr city and Adhamiya. -1 In in ADP IN _ 2 case 2:case _ -2 Baghdad Baghdad PROPN NNP Number=Sing 6 obl 6:obl:in _ -3 the the DET DT Definite=Def|PronType=Art 4 det 4:det _ -4 fighting fighting NOUN NN Number=Sing 6 nsubj 6:nsubj _ -5 still still ADV RB _ 6 advmod 6:advmod _ -6 continues continue VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -7 in in ADP IN _ 9 case 9:case _ -8 several several ADJ JJ Degree=Pos 9 amod 9:amod _ -9 areas area NOUN NNS Number=Plur 6 obl 6:obl:in SpaceAfter=No -10 , , PUNCT , _ 6 punct 6:punct _ -11 mostly mostly ADV RB _ 14 advmod 14:advmod _ -12 in in ADP IN _ 14 case 14:case _ -13 Sadr Sadr PROPN NNP Number=Sing 14 compound 14:compound _ -14 city city NOUN NN Number=Sing 6 obl 6:obl:in _ -15 and and CCONJ CC _ 16 cc 16:cc _ -16 Adhamiya Adhamiya PROPN NNP Number=Sing 14 conj 6:obl:in|14:conj:and SpaceAfter=No -17 . . PUNCT . _ 6 punct 6:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0002 -# text = Baghdadis don't venture much out of their neighbourhoods any more, you never know where you might get stuck. 
-1 Baghdadis Baghdadis PROPN NNPS Number=Plur 4 nsubj 4:nsubj _ -2 do do AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 4 aux 4:aux SpaceAfter=No -3 n't not PART RB _ 4 advmod 4:advmod _ -4 venture venture VERB VB VerbForm=Inf 0 root 0:root _ -5 much much ADV RB _ 4 advmod 4:advmod _ -6 out out ADP IN _ 9 case 9:case _ -7 of of ADP IN _ 9 case 9:case _ -8 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 9 nmod:poss 9:nmod:poss _ -9 neighbourhoods neighbourhood NOUN NNS Number=Plur 4 obl 4:obl:of _ -10 any any ADV RB _ 11 advmod 11:advmod _ -11 more more ADV RBR _ 4 advmod 4:advmod SpaceAfter=No -12 , , PUNCT , _ 4 punct 4:punct _ -13 you you PRON PRP Case=Nom|Person=2|PronType=Prs 15 nsubj 15:nsubj _ -14 never never ADV RB _ 15 advmod 15:advmod _ -15 know know VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 4 ccomp 4:ccomp _ -16 where where ADV WRB PronType=Int 20 mark 20:mark _ -17 you you PRON PRP Case=Nom|Person=2|PronType=Prs 20 nsubj:pass 20:nsubj:pass _ -18 might might AUX MD VerbForm=Fin 20 aux 20:aux _ -19 get get VERB VB VerbForm=Inf 20 aux:pass 20:aux:pass _ -20 stuck stuck ADJ JJ Degree=Pos 15 ccomp 15:ccomp SpaceAfter=No -21 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0003 -# text = There has been talk that the night curfew might be implemented again. -1 There there PRON EX _ 3 expl 3:expl _ -2 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ -3 been be VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -4 talk talk NOUN NN Number=Sing 3 nsubj 3:nsubj _ -5 that that SCONJ IN _ 11 mark 11:mark _ -6 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ -7 night night NOUN NN Number=Sing 8 compound 8:compound _ -8 curfew curfew NOUN NN Number=Sing 11 nsubj:pass 11:nsubj:pass _ -9 might might AUX MD VerbForm=Fin 11 aux 11:aux _ -10 be be AUX VB VerbForm=Inf 11 aux:pass 11:aux:pass _ -11 implemented implement VERB VBN Tense=Past|VerbForm=Part 4 acl 4:acl:that _ -12 again again ADV RB _ 11 advmod 11:advmod SpaceAfter=No -13 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0004 -# text = My neighbourhood has been surrounded by American troops for three days now, helicopters have been circling over our heads non-stop. 
-1 My my PRON PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs 2 nmod:poss 2:nmod:poss _ -2 neighbourhood neighbourhood NOUN NN Number=Sing 5 nsubj:pass 5:nsubj:pass _ -3 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 aux 5:aux _ -4 been be AUX VBN Tense=Past|VerbForm=Part 5 aux:pass 5:aux:pass _ -5 surrounded surround VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ -6 by by ADP IN _ 8 case 8:case _ -7 American american ADJ JJ Degree=Pos 8 amod 8:amod _ -8 troops troops NOUN NNS Number=Plur 5 obl 5:obl:by _ -9 for for ADP IN _ 11 case 11:case _ -10 three three NUM CD NumType=Card 11 nummod 11:nummod _ -11 days day NOUN NNS Number=Plur 5 obl 5:obl:for _ -12 now now ADV RB _ 5 advmod 5:advmod SpaceAfter=No -13 , , PUNCT , _ 5 punct 5:punct _ -14 helicopters helicopter NOUN NNS Number=Plur 17 nsubj 17:nsubj _ -15 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 17 aux 17:aux _ -16 been be AUX VBN Tense=Past|VerbForm=Part 17 aux 17:aux _ -17 circling circle VERB VBG Tense=Pres|VerbForm=Part 5 parataxis 5:parataxis _ -18 over over ADP IN _ 20 case 20:case _ -19 our we PRON PRP$ Number=Plur|Person=1|Poss=Yes|PronType=Prs 20 nmod:poss 20:nmod:poss _ -20 heads head NOUN NNS Number=Plur 17 obl 17:obl:over _ -21 non-stop non-stop ADV RB _ 17 advmod 17:advmod SpaceAfter=No -22 . . PUNCT . _ 5 punct 5:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0005 -# text = Fedayeen are now visible on the street and they have become bolder than ever. -1 Fedayeen fedayeen NOUN NNS Number=Plur 4 nsubj 4:nsubj _ -2 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 4 cop 4:cop _ -3 now now ADV RB _ 4 advmod 4:advmod _ -4 visible visible ADJ JJ Degree=Pos 0 root 0:root _ -5 on on ADP IN _ 7 case 7:case _ -6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _ -7 street street NOUN NN Number=Sing 4 obl 4:obl:on _ -8 and and CCONJ CC _ 11 cc 11:cc _ -9 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 11 nsubj 11:nsubj|12:nsubj:xsubj _ -10 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 11 aux 11:aux _ -11 become become VERB VBN Tense=Past|VerbForm=Part 4 conj 4:conj:and _ -12 bolder bolder ADJ JJR Degree=Cmp 11 xcomp 11:xcomp _ -13 than than ADP IN _ 14 case 14:case _ -14 ever ever ADV RB _ 12 obl 12:obl:than SpaceAfter=No -15 . . PUNCT . _ 4 punct 4:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0006 -# text = Yesterday there were tens of them putting road blocks on our street and setting up mortars, they only come out in the open when Americans leave the area, then they start firing mortars indiscriminately and shooting their AK-47's in the air. 
-1 Yesterday yesterday NOUN NN Number=Sing 3 obl:tmod 3:obl:tmod _ -2 there there PRON EX _ 3 expl 3:expl _ -3 were be VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ -4 tens ten NOUN NNS Number=Plur 3 nsubj 3:nsubj _ -5 of of ADP IN _ 6 case 6:case _ -6 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 4 nmod 4:nmod:of _ -7 putting put VERB VBG VerbForm=Ger 4 acl 4:acl _ -8 road road NOUN NN Number=Sing 9 compound 9:compound _ -9 blocks block NOUN NNS Number=Plur 7 obj 7:obj _ -10 on on ADP IN _ 12 case 12:case _ -11 our we PRON PRP$ Number=Plur|Person=1|Poss=Yes|PronType=Prs 12 nmod:poss 12:nmod:poss _ -12 street street NOUN NN Number=Sing 7 obl 7:obl:on _ -13 and and CCONJ CC _ 14 cc 14:cc _ -14 setting set VERB VBG VerbForm=Ger 7 conj 4:acl|7:conj:and _ -15 up up ADP RP _ 14 compound:prt 14:compound:prt _ -16 mortars mortar NOUN NNS Number=Plur 14 obj 14:obj SpaceAfter=No -17 , , PUNCT , _ 3 punct 3:punct _ -18 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 20 nsubj 20:nsubj _ -19 only only ADV RB _ 20 advmod 20:advmod _ -20 come come VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 3 parataxis 3:parataxis _ -21 out out ADV RB _ 20 advmod 20:advmod _ -22 in in ADP IN _ 24 case 24:case _ -23 the the DET DT Definite=Def|PronType=Art 24 det 24:det _ -24 open open ADJ JJ Degree=Pos 20 obl 20:obl:in _ -25 when when ADV WRB PronType=Int 27 mark 27:mark _ -26 Americans Americans PROPN NNPS Number=Plur 27 nsubj 27:nsubj _ -27 leave leave VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 20 advcl 20:advcl:when _ -28 the the DET DT Definite=Def|PronType=Art 29 det 29:det _ -29 area area NOUN NN Number=Sing 27 obj 27:obj SpaceAfter=No -30 , , PUNCT , _ 20 punct 20:punct _ -31 then then ADV RB PronType=Dem 33 advmod 33:advmod _ -32 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 33 nsubj 33:nsubj|34:nsubj:xsubj|38:nsubj:xsubj _ -33 start start VERB VB VerbForm=Inf 20 parataxis 20:parataxis _ -34 firing fire VERB VBG VerbForm=Ger 33 xcomp 33:xcomp _ -35 mortars mortar NOUN NNS Number=Plur 34 obj 34:obj _ -36 indiscriminately indiscriminately ADV RB _ 34 advmod 34:advmod _ -37 and and CCONJ CC _ 38 cc 38:cc _ -38 shooting shoot VERB VBG VerbForm=Ger 34 conj 33:xcomp|34:conj:and _ -39 their they PRON PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs 42 nmod:poss 42:nmod:poss _ -40 AK ak NOUN NN Number=Sing 42 compound 42:compound SpaceAfter=No -41 - - PUNCT HYPH _ 42 punct 42:punct SpaceAfter=No -42 47's 47' NOUN NNS Number=Plur 38 obj 38:obj _ -43 in in ADP IN _ 45 case 45:case _ -44 the the DET DT Definite=Def|PronType=Art 45 det 45:det _ -45 air air NOUN NN Number=Sing 38 obl 38:obl:in SpaceAfter=No -46 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0007 -# text = They are setting the road blocks at the exact same positions they were during the war last year, which indicates they are the same people. 
-1 They they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 3 nsubj 3:nsubj _ -2 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ -3 setting set VERB VBG Tense=Pres|VerbForm=Part 0 root 0:root _ -4 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ -5 road road NOUN NN Number=Sing 6 compound 6:compound _ -6 blocks block NOUN NNS Number=Plur 3 obj 3:obj _ -7 at at ADP IN _ 11 case 11:case _ -8 the the DET DT Definite=Def|PronType=Art 11 det 11:det _ -9 exact exact ADV RB _ 10 advmod 10:advmod _ -10 same same ADJ JJ Degree=Pos 11 amod 11:amod _ -11 positions position NOUN NNS Number=Plur 3 obl 3:obl:at _ -12 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 13 nsubj 13:nsubj _ -13 were be VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 11 acl:relcl 11:acl:relcl _ -14 during during ADP IN _ 16 case 16:case _ -15 the the DET DT Definite=Def|PronType=Art 16 det 16:det _ -16 war war NOUN NN Number=Sing 13 obl 13:obl:during _ -17 last last ADJ JJ Degree=Pos 18 amod 18:amod _ -18 year year NOUN NN Number=Sing 16 nmod:tmod 16:nmod:tmod SpaceAfter=No -19 , , PUNCT , _ 3 punct 3:punct _ -20 which which PRON WDT PronType=Rel 21 nsubj 21:nsubj _ -21 indicates indicate VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 advcl 3:advcl _ -22 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 26 nsubj 26:nsubj _ -23 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 26 cop 26:cop _ -24 the the DET DT Definite=Def|PronType=Art 26 det 26:det _ -25 same same ADJ JJ Degree=Pos 26 amod 26:amod _ -26 people people NOUN NNS Number=Plur 21 ccomp 21:ccomp SpaceAfter=No -27 . . PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0008 -# text = And there is nothing we can do about it really, people who are suggesting that we go out and fight them are living in dream land. -1 And and CCONJ CC _ 3 cc 3:cc _ -2 there there PRON EX _ 3 expl 3:expl _ -3 is be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ -4 nothing nothing PRON NN Number=Sing 3 nsubj 3:nsubj _ -5 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 7 nsubj 7:nsubj _ -6 can can AUX MD VerbForm=Fin 7 aux 7:aux _ -7 do do VERB VB VerbForm=Inf 4 acl:relcl 4:acl:relcl _ -8 about about ADP IN _ 9 case 9:case _ -9 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 7 obl 7:obl:about _ -10 really really ADV RB _ 7 advmod 7:advmod SpaceAfter=No -11 , , PUNCT , _ 3 punct 3:punct _ -12 people people NOUN NNS Number=Plur 24 nsubj 15:nsubj|24:nsubj _ -13 who who PRON WP PronType=Rel 15 nsubj 12:ref _ -14 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 15 aux 15:aux _ -15 suggesting suggest VERB VBG Tense=Pres|VerbForm=Part 12 acl:relcl 12:acl:relcl _ -16 that that SCONJ IN _ 18 mark 18:mark _ -17 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 18 nsubj 18:nsubj|21:nsubj _ -18 go go VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 15 ccomp 15:ccomp _ -19 out out ADV RB _ 18 advmod 18:advmod _ -20 and and CCONJ CC _ 21 cc 21:cc _ -21 fight fight VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 18 conj 15:ccomp|18:conj:and _ -22 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 21 obj 21:obj _ -23 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 24 aux 24:aux _ -24 living live VERB VBG Tense=Pres|VerbForm=Part 3 parataxis 3:parataxis _ -25 in in ADP IN _ 27 case 27:case _ -26 dream dream NOUN NN Number=Sing 27 compound 27:compound _ -27 land land NOUN NN Number=Sing 24 obl 24:obl:in SpaceAfter=No -28 . . 
PUNCT . _ 3 punct 3:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0009 -# text = Even the IP and ICDC have abandoned the neighbourhood, and those are trained and armed, so don't expect scared civilians to do anything except to hide inside and pray a helicopter or a tank doesn't bomb them, and also how are American soldiers going to distinguish the brave and valiant civilians from the Fedayeen? -1 Even even ADV RB _ 3 advmod 3:advmod _ -2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _ -3 IP IP PROPN NNP Number=Sing 7 nsubj 7:nsubj _ -4 and and CCONJ CC _ 5 cc 5:cc _ -5 ICDC ICDC PROPN NNP Number=Sing 3 conj 3:conj:and|7:nsubj _ -6 have have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 7 aux 7:aux _ -7 abandoned abandon VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ -8 the the DET DT Definite=Def|PronType=Art 9 det 9:det _ -9 neighbourhood neighbourhood NOUN NN Number=Sing 7 obj 7:obj SpaceAfter=No -10 , , PUNCT , _ 14 punct 14:punct _ -11 and and CCONJ CC _ 14 cc 14:cc _ -12 those those PRON DT Number=Plur|PronType=Dem 14 nsubj 14:nsubj|16:nsubj _ -13 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 14 cop 14:cop _ -14 trained trained ADJ JJ Degree=Pos 7 conj 7:conj:and _ -15 and and CCONJ CC _ 16 cc 16:cc _ -16 armed armed ADJ JJ Degree=Pos 14 conj 14:conj:and SpaceAfter=No -17 , , PUNCT , _ 7 punct 7:punct _ -18 so so ADV RB _ 21 advmod 21:advmod _ -19 do do AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 21 aux 21:aux SpaceAfter=No -20 n't not PART RB _ 21 advmod 21:advmod _ -21 expect expect VERB VB Mood=Imp|VerbForm=Fin 7 parataxis 7:parataxis _ -22 scared scared ADJ JJ Degree=Pos 23 amod 23:amod _ -23 civilians civilian NOUN NNS Number=Plur 21 obj 21:obj|25:nsubj:xsubj _ -24 to to PART TO _ 25 mark 25:mark _ -25 do do VERB VB VerbForm=Inf 21 xcomp 21:xcomp _ -26 anything anything PRON NN Number=Sing 25 obj 25:obj _ -27 except except SCONJ IN _ 29 mark 29:mark _ -28 to to PART TO _ 29 mark 29:mark _ -29 hide hide VERB VB VerbForm=Inf 26 acl 26:acl:to _ -30 inside inside ADV RB _ 29 advmod 29:advmod _ -31 and and CCONJ CC _ 32 cc 32:cc _ -32 pray pray VERB VB VerbForm=Inf 29 conj 26:acl:to|29:conj:and _ -33 a a DET DT Definite=Ind|PronType=Art 34 det 34:det _ -34 helicopter helicopter NOUN NN Number=Sing 40 nsubj 40:nsubj _ -35 or or CCONJ CC _ 37 cc 37:cc _ -36 a a DET DT Definite=Ind|PronType=Art 37 det 37:det _ -37 tank tank NOUN NN Number=Sing 34 conj 34:conj:or|40:nsubj _ -38 does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 40 aux 40:aux SpaceAfter=No -39 n't not PART RB _ 40 advmod 40:advmod _ -40 bomb bomb VERB VB VerbForm=Inf 32 ccomp 32:ccomp _ -41 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 40 obj 40:obj SpaceAfter=No -42 , , PUNCT , _ 49 punct 49:punct _ -43 and and CCONJ CC _ 49 cc 49:cc _ -44 also also ADV RB _ 49 advmod 49:advmod _ -45 how how ADV WRB PronType=Int 49 advmod 49:advmod _ -46 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 49 aux 49:aux _ -47 American american ADJ JJ Degree=Pos 48 amod 48:amod _ -48 soldiers soldier NOUN NNS Number=Plur 49 nsubj 49:nsubj|51:nsubj:xsubj _ -49 going go VERB VBG Tense=Pres|VerbForm=Part 7 conj 7:conj:and _ -50 to to PART TO _ 51 mark 51:mark _ -51 distinguish distinguish VERB VB VerbForm=Inf 49 xcomp 49:xcomp _ -52 the the DET DT Definite=Def|PronType=Art 56 det 56:det _ -53 brave brave ADJ JJ Degree=Pos 56 amod 56:amod _ -54 and and CCONJ CC _ 55 cc 55:cc _ -55 valiant valiant ADJ JJ Degree=Pos 53 conj 53:conj:and|56:amod _ -56 civilians civilian 
NOUN NNS Number=Plur 51 obj 51:obj _ -57 from from ADP IN _ 59 case 59:case _ -58 the the DET DT Definite=Def|PronType=Art 59 det 59:det _ -59 Fedayeen fedayeen NOUN NNS Number=Plur 51 obl 51:obl:from SpaceAfter=No -60 ? ? PUNCT . _ 7 punct 7:punct _ - -# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0010 -# text = Everyone is apprehensive, there is some talk that April 9th and 10th are going to be bloody days. -1 Everyone everyone PRON NN Number=Sing 3 nsubj 3:nsubj _ -2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 cop 3:cop _ -3 apprehensive apprehensive ADJ JJ Degree=Pos 0 root 0:root SpaceAfter=No -4 , , PUNCT , _ 3 punct 3:punct _ -5 there there PRON EX _ 6 expl 6:expl _ -6 is be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 parataxis 3:parataxis _ -7 some some DET DT _ 8 det 8:det _ -8 talk talk NOUN NN Number=Sing 6 nsubj 6:nsubj _ -9 that that SCONJ IN _ 15 mark 15:mark _ -10 April April PROPN NNP Number=Sing 15 nsubj 15:nsubj|19:nsubj:xsubj _ -11 9th 9th NOUN NN Number=Sing 10 nummod 10:nummod _ -12 and and CCONJ CC _ 13 cc 13:cc _ -13 10th 10th NOUN NN Number=Sing 11 conj 10:nummod|11:conj:and _ -14 are be AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 15 aux 15:aux _ -15 going go VERB VBG Tense=Pres|VerbForm=Part 8 acl 8:acl:that _ -16 to to PART TO _ 19 mark 19:mark _ -17 be be AUX VB VerbForm=Inf 19 cop 19:cop _ -18 bloody bloody ADJ JJ Degree=Pos 19 amod 19:amod _ -19 days day NOUN NNS Number=Plur 15 xcomp 15:xcomp SpaceAfter=No -20 . . PUNCT . _ 3 punct 3:punct _ - diff --git a/tests/test_corpus/en_ewt-ud-train.txt b/tests/test_corpus/en_ewt-ud-train.txt deleted file mode 100644 index 1a64e9d01..000000000 --- a/tests/test_corpus/en_ewt-ud-train.txt +++ /dev/null @@ -1,77 +0,0 @@ -Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the -mosque in the town of Qaim, near the Syrian border. [This killing of a respected -cleric will be causing us trouble for years to come.] DPA: Iraqi authorities -announced that they had busted up 3 terrorist cells operating in Baghdad. Two of -them were being run by 2 officials of the Ministry of the Interior! The MoI in -Iraq is equivalent to the US FBI, so this would be like having J. Edgar Hoover -unwittingly employ at a high level members of the Weathermen bombers back in the -1960s. The third was being run by the head of an investment firm. You wonder if -he was manipulating the market with his bombing targets. The cells were -operating in the Ghazaliyah and al-Jihad districts of the capital. Although the -announcement was probably made to show progress in identifying and breaking up -terror cells, I don't find the news that the Baathists continue to penetrate the -Iraqi government very hopeful. It reminds me too much of the ARVN officers who -were secretly working for the other side in Vietnam. Al-Zaman : Guerrillas -killed a member of the Kurdistan Democratic Party after kidnapping him in Mosul. -The police commander of Ninevah Province announced that bombings had declined 80 -percent in Mosul, whereas there had been a big jump in the number of -kidnappings. On Wednesday guerrillas had kidnapped a cosmetic surgeon and his -wife while they were on their way home. In Suwayrah, Kut Province, two car bombs -were discovered before they could be detonated. (Kut is in southeastern Iraq and -has an overwhelmingly Shiite population, who are on the lookout for Baathist -saboteurs and willingly turn them in. 
This willingness is the main difference in -the number of bombings in the south as opposed to the center-north of the -country.) In Baghdad Kadhim Talal Husain, assistant dean at the School of -Education at Mustansiriyah University, was assassinated with his driver in the -Salikh district. Guerrillas killed an engineer, Asi Ali, from Tikrit. They also -killed Shaikh Hamid 'Akkab, a clan elder of a branch of the Dulaim tribe in -Tikrit. His mother was also killed in the attack. Two other Dulaim leaders have -been killed in the past week and a half. Guerrillas near Hawijah launched an -attack that left 6 dead, including 4 Iraqi soldiers. One of them was from the -Jubur tribe and was deputy commander of the Hawijah garrison. Two hundred -members of the Batawi clan of the Dulaim demonstrated in Baghdad on Friday, -protesting the killing of their clan elder, Shaikh Kadhim Sarhid and 4 of his -sons, by gunmen wearing Iraqi army uniforms. (This is a largely Sunni Arab clan, -and some Sunni observers have accused Shiite elements in the government of being -behind the assassination; it is more likely the work of Sunni Arab guerrillas -punishing the Batawi leaders for cooperating with the Dec. 15 elections.) -Al-Zaman : The Iraqi High Electoral Commission on Friday denied a request of the -Debaathification Commission to exclude 51 individuals from running on party -lists in the Dec. 15 elections on grounds of having been sufficiently involved -in Baath activities to warrant their being excluded from civil office. The -Commission said it had no legal grounds for such an exclusion. This item is a -small one and easily missed. But in my view it is highly significant. The -Debaathification Commission had been pushed by Ahmad Chalabi and his Iraqi -National Congress very hard, and had pushed many Sunni Arabs into the arms of -the guerrillas. Chalabi has been increasingly marginalized within Iraq, however, -despite his ties of clientelage with Washington and Tehran. He is no longer in -the dominant Shiite list, the United Iraqi Alliance, and won't have many seats -in the new parliament. Some 2,000 junior officers of the old Baath army have -been recalled to duty in recent months, something Chalabi would have blocked if -he could have. Now the Electoral Commission is refusing to punish people for -mere past Baath Party membership. The situation in Iraq is only going to get -better this way. If someone committed a crime against humanity, prosecute the -person. If he or she did not, then they should have all the same rights as other -Iraqis. Al-Sharq al-Awsat reports that a key eyewitness in the trial of Saddam -Hussein for a 1982 massacre at Dujail has died. A team from the court managed to -take his deposition before he died. The trial begins again Nov.28. - -In Baghdad the fighting still continues in several areas, mostly in Sadr city -and Adhamiya. Baghdadis don't venture much out of their neighbourhoods any more, -you never know where you might get stuck. There has been talk that the night -curfew might be implemented again. My neighbourhood has been surrounded by -American troops for three days now, helicopters have been circling over our -heads non-stop. Fedayeen are now visible on the street and they have become -bolder than ever. Yesterday there were tens of them putting road blocks on our -street and setting up mortars, they only come out in the open when Americans -leave the area, then they start firing mortars indiscriminately and shooting -their AK-47's in the air. 
They are setting the road blocks at the exact same -positions they were during the war last year, which indicates they are the same -people. And there is nothing we can do about it really, people who are -suggesting that we go out and fight them are living in dream land. Even the IP -and ICDC have abandoned the neighbourhood, and those are trained and armed, so -don't expect scared civilians to do anything except to hide inside and pray a -helicopter or a tank doesn't bomb them, and also how are American soldiers going -to distinguish the brave and valiant civilians from the Fedayeen? Everyone is -apprehensive, there is some talk that April 9th and 10th are going to be bloody -days.