UCREL · apmoore1 · Oct 11, 2025
diff --git a/resources/welsh/README.md b/resources/welsh/README.md
@@ -34,7 +34,8 @@ The enhanced gold standard dataset, [./enhanced_gold_standard_data.txt](./enhanc
 3. `core POS tag` - The core POS tag that is used by the USAS multilingual lexicon, this is found through the mapping of basic CorCenCC POS tags to core USAS POS tags, see the [Mapper of basic CorCenCC POS tags to core USAS POS tags section below for more details.](#mapper-of-basic-corcencc-pos-tags-to-core-usas-pos-tags)
 4. `basic POS tag` - same as before, but was called the `POS tag`
 5. `enriched POS tag` - this has come from running the [CyTag tagger](https://github.com/CorCenCC/CyTag). As this tag has been predicted it may be different to the `basic POS tag`, as the basic tag is gold standard.
-6. `USAS tag` - same as before
+6. `predicted basic POS tag` - this has come from running the [CyTag tagger](https://github.com/CorCenCC/CyTag). As this tag has been predicted it may be different to the `basic POS tag`, as the basic tag is gold standard. This is useful for evaluating the semantic tagger with predicted POS tags rather than gold tags, which is closer to how the tagger will be used in production settings. Note that all basic POS tags that are predicted as `unk` are mapped to the basic POS tag `Gw`, which is then mapped to the core USAS POS tag `xx`, which stands for `unrecognized token`.
+7. `USAS tag` - same as before
 
 To re-create this dataset run the following:
 

diff --git a/resources/welsh/create_enhanced_dataset.py b/resources/welsh/create_enhanced_dataset.py
@@ -26,13 +26,15 @@ def read_gold_data(gold_data_path: Path,
     return all_sentence_data
 
 
-def read_cytag_data(cytag_data_path: Path
+def read_cytag_data(cytag_data_path: Path,
+                    basic_to_core_pos_mapper: dict[str, str]
                     ) -> list[list[dict[str, str]]]:
     '''
     :returns: A list whereby the outer list represents each sentence/line data. 
               The inner list represents each token's data. The dictionary is 
               made up of the following keys: 1. `lemma`, 2. `enhanced_pos`, 
-              3. `token`. **NOTE** the lemmas if they have a space in them e.g. 
+              3. `token` and 4. `basic_pos`.
+              **NOTE** the lemmas if they have a space in them e.g. 
               `mod rewrite` the space is replace with an underscore e.g. 
               `mod_rewrite`
     '''
@@ -49,11 +51,25 @@ def read_cytag_data(cytag_data_path: Path
             # lemma we take the first lemma and associated rich pos tag.
             token_data['lemma'] = token.get('lemma').split('|')[0].strip(' ')
             token_data['lemma'] = token_data['lemma'].replace(' ', '_')
+
             enhanced_pos = token.get('rich_pos').split('|')[0].strip(' ')
             if enhanced_pos in rich_pos_error_mapper:
                 enhanced_pos = rich_pos_error_mapper[enhanced_pos]
+
+            basic_pos = token.get('basic_pos').split('|')[0].strip(' ')
+            if basic_pos == 'unk':
+                basic_pos = 'Gw'
+            if basic_pos in rich_pos_error_mapper:
+                basic_pos = rich_pos_error_mapper[basic_pos]
+            if basic_pos not in basic_to_core_pos_mapper:
+                raise KeyError(f"The predicted basic POS tag ({basic_pos}) cannot be found in "
+                               "the mapper file of basic POS to core USAS POS "
+                               "tags.")
+
+            token_data['basic_pos'] = basic_pos
             token_data['enhanced_pos'] = enhanced_pos
             token_data['token'] = token.text.strip(' ')
+
             sentence_data.append(token_data)
         all_sentence_data.append(sentence_data)
     return all_sentence_data
@@ -72,14 +88,16 @@ def collapse_sentences_to_tokens(sentence_data: list[list[dict[str, str]]]
     return token_data
 
 
-cytag_data_path = Path('.', 'cytag_output.xml').resolve()
-cytag_data = read_cytag_data(cytag_data_path)
-
 basic_to_core_pos_mapper_path = Path('.', 'basic_cy_tags_to_core_tags.json').resolve()
 basic_to_core_pos_mapper = {}
 with basic_to_core_pos_mapper_path.open('r') as basic_to_core_data:
     basic_to_core_pos_mapper = json.load(basic_to_core_data)
 
+cytag_data_path = Path('.', 'cytag_output.xml').resolve()
+cytag_data = read_cytag_data(cytag_data_path, basic_to_core_pos_mapper)
+
+
+
 gold_data_path = Path('.', 'original_gold_standard_data.txt').resolve()
 gold_data = read_gold_data(gold_data_path, basic_to_core_pos_mapper)
 
@@ -101,6 +119,7 @@ def collapse_sentences_to_tokens(sentence_data: list[list[dict[str, str]]]
             token_string = (f'{gold_token_data["token"]}|{pred_token_data["lemma"]}|'
                             f'{token["core_pos"]}|{token["basic_pos"]}'
                             f'|{pred_token_data["enhanced_pos"]}'
+                            f'|{pred_token_data["basic_pos"]}'
                             f'|{gold_token_data["usas_tag"]}')
             enhanced_token_data.append(token_string)
             token_count += 1