Skip to content

Commit 99b6264

Browse files
authored
Merge pull request #740 from metafacture/708-html_to_text
2 parents 5e7d3cc + 95ef635 commit 99b6264

7 files changed

Lines changed: 180 additions & 0 deletions

File tree

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,18 @@ from_json("<sourceField>"[, error_string: "<errorValue>"])
826826

827827
[Java Code](https://github.com/metafacture/metafacture-core/blob/master/metafix/src/main/java/org/metafacture/metafix/method/field/FromJson.java)
828828

829+
##### `html_to_text`
830+
831+
Converts an HTML string to plain text by stripping the markup.
832+
833+
```perl
834+
html_to_text("<sourceField>")
835+
```
836+
837+
[Example in Playground](https://metafacture.org/playground/?example=html_to_text)
838+
839+
[Java Code](https://github.com/metafacture/metafacture-core/blob/master/metafix/src/main/java/org/metafacture/metafix/method/field/HtmlToText.java)
840+
829841
##### `index`
830842

831843
Returns the index position of a substring in a field and replaces the field value with this number.
@@ -931,6 +943,18 @@ lookup("path.to.field", "map-name", print_unknown: "true", destination: "unknown
931943
932944
[Java Code](https://github.com/metafacture/metafacture-core/blob/master/metafix/src/main/java/org/metafacture/metafix/method/field/Lookup.java)
933945
946+
##### `normalize_utf8`
947+
948+
Normalizes diacritics in a string to their precomposed form (Unicode Normalization Form C, NFC).
949+
950+
```perl
951+
normalize_utf8("<sourceField>")
952+
```
953+
954+
[Example in Playground](https://metafacture.org/playground/?example=normalize_utf8)
955+
956+
[Java Code](https://github.com/metafacture/metafacture-core/blob/master/metafix/src/main/java/org/metafacture/metafix/method/field/NormalizeUTF8.java)
957+
934958
##### `prepend`
935959
936960
Adds a string at the beginning of a field value.

metafacture-runner/src/main/dist/examples/gnd/gnd-and-beacons/tp2json.fix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ do put_macro("gndPersonCombinedLabel") # in contrast to morph this is not normal
1818
if exists("$[field].pre")
1919
paste("$[field].@combinedLabel", "$[field].@combinedLabel", "$[field].pre", join_char:"")
2020
end
21+
normalize_utf8("$[field].@combinedLabel")
2122
copy_field("$[field].@combinedLabel", "$[out]")
2223
end
2324

metafacture-runner/src/main/dist/examples/gnd/json/tp2json.fix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ do put_macro("gndPersonCombinedLabel") # in contrast to morph this is not normal
1212
if exists("$[field].pre")
1313
paste("$[field].@combinedLabel", "$[field].@combinedLabel", "$[field].pre", join_char:"")
1414
end
15+
normalize_utf8("$[field].@combinedLabel")
1516
copy_field("$[field].@combinedLabel", "$[out]")
1617
end
1718

metafix/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dependencies {
1818
implementation "org.eclipse.emf:org.eclipse.emf.ecore:${versions.xtext}" // Workaround for hbz/lobid-resources#1462
1919
implementation "org.eclipse.xtext:org.eclipse.xtext.xbase:${versions.xtext}"
2020
implementation "org.eclipse.xtext:org.eclipse.xtext:${versions.xtext}"
21+
implementation "org.jsoup:jsoup:${versions.jsoup}"
2122
implementation "org.slf4j:slf4j-api:${versions.slf4j}"
2223

2324
testImplementation "com.github.tomakehurst:wiremock-jre8:${versions.wiremock}"
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Copyright 2025 Tobias Bülte, hbz NRW
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.metafacture.metafix.method.field;
18+
19+
import org.metafacture.metafix.FixCommand;
20+
import org.metafacture.metafix.Metafix;
21+
import org.metafacture.metafix.Record;
22+
import org.metafacture.metafix.api.FixFunction;
23+
24+
import org.jsoup.Jsoup;
25+
26+
import java.util.List;
27+
import java.util.Map;
28+
29+
/**
30+
* Turn HTML text to plain text.
31+
*
32+
* @author Tobias Bülte (tobiasNx)
33+
*
34+
*/
35+
@FixCommand("html_to_text")
36+
public class HtmlToText implements FixFunction {
37+
38+
/**
39+
* Creates an instance of {@link HtmlToText}.
40+
*/
41+
public HtmlToText() {
42+
}
43+
44+
@Override
45+
public void apply(final Metafix metafix, final Record record, final List<String> params, final Map<String, String> options) {
46+
record.transform(params.get(0), s -> Jsoup.parse(s).wholeText());
47+
}
48+
49+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright 2025 hbz NRW
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.metafacture.metafix.method.field;
18+
19+
import org.metafacture.metafix.FixCommand;
20+
import org.metafacture.metafix.Metafix;
21+
import org.metafacture.metafix.Record;
22+
import org.metafacture.metafix.api.FixFunction;
23+
24+
import java.text.Normalizer;
25+
import java.text.Normalizer.Form;
26+
import java.util.List;
27+
import java.util.Map;
28+
29+
/**
30+
* Performs normalization of diacritics in UTF-8 encoded strings.
31+
*
32+
* @author Tobias Bülte, hbz
33+
*/
34+
@FixCommand("normalize_utf8")
35+
public class NormalizeUTF8 implements FixFunction {
36+
37+
/**
38+
* Creates an instance of {@link NormalizeUTF8}.
39+
*/
40+
public NormalizeUTF8() {
41+
}
42+
43+
@Override
44+
public void apply(final Metafix metafix, final Record record, final List<String> params, final Map<String, String> options) {
45+
record.transform(params.get(0), s -> Normalizer.normalize(s, Form.NFC));
46+
}
47+
48+
}

metafix/src/test/java/org/metafacture/metafix/MetafixMethodTest.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,6 +1446,38 @@ public void shouldFlattenNestedArrayWithHashes() {
14461446
);
14471447
}
14481448

1449+
@Test
1450+
public void shouldTurnHtmlToText() {
1451+
MetafixTestHelpers.assertFix(streamReceiver, Arrays.asList(
1452+
"html_to_text('data.description')"
1453+
),
1454+
i -> {
1455+
i.startRecord("1");
1456+
i.startEntity("data");
1457+
i.literal("description", "Das Material ist im Zusammenhang mit der frei verfügbaren wortschatzdidaktischen Internetseite <a href=\"https://www.wortschatzwissen.de\">www.wortschatzwissen.de</a>");
1458+
i.endEntity();
1459+
i.endRecord();
1460+
i.startRecord("2");
1461+
i.startEntity("data");
1462+
i.literal("description", "<b>Hello World.</b><br/><p><i>Is there anyone out there?</i><p>");
1463+
i.endEntity();
1464+
i.endRecord();
1465+
},
1466+
o -> {
1467+
o.get().startRecord("1");
1468+
o.get().startEntity("data");
1469+
o.get().literal("description", "Das Material ist im Zusammenhang mit der frei verfügbaren wortschatzdidaktischen Internetseite www.wortschatzwissen.de");
1470+
o.get().endEntity();
1471+
o.get().endRecord();
1472+
o.get().startRecord("2");
1473+
o.get().startEntity("data");
1474+
o.get().literal("description", "Hello World.\nIs there anyone out there?");
1475+
o.get().endEntity();
1476+
o.get().endRecord();
1477+
}
1478+
);
1479+
}
1480+
14491481
@Test
14501482
public void shouldGetFirstIndexOfSubstring() {
14511483
MetafixTestHelpers.assertFix(streamReceiver, Arrays.asList(
@@ -1635,6 +1667,30 @@ public void shouldJoinArrayObjectField() {
16351667
);
16361668
}
16371669

1670+
@Test
1671+
public void shouldNormalizeUTF8() {
1672+
MetafixTestHelpers.assertFix(streamReceiver, Arrays.asList(
1673+
"normalize_utf8('data.title')"
1674+
),
1675+
i -> {
1676+
i.startRecord("1");
1677+
i.startEntity("data");
1678+
// The umlauts in this string are decomposed, i.e. each one consists of two characters (e.g. the base letter u followed by a combining diaeresis):
1679+
i.literal("title", "Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.");
1680+
i.endEntity();
1681+
i.endRecord();
1682+
},
1683+
o -> {
1684+
o.get().startRecord("1");
1685+
o.get().startEntity("data");
1686+
// The umlauts in this string are individual characters:
1687+
o.get().literal("title", "Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.");
1688+
o.get().endEntity();
1689+
o.get().endRecord();
1690+
}
1691+
);
1692+
}
1693+
16381694
@Test
16391695
public void shouldPrependValue() {
16401696
MetafixTestHelpers.assertFix(streamReceiver, Arrays.asList(

0 commit comments

Comments
 (0)