Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
cd2f598
Merge Czech stemmer
ojwb Aug 31, 2021
b2a8e2a
Only apply do_case in R1
ojwb Aug 31, 2021
d074268
Implement the "light" version of the stemmer
ojwb Sep 1, 2021
1aabc0e
Improve comment about origin of algorithm
ojwb Sep 1, 2021
5216878
czech: Strip out unused "aggressive" code
ojwb Sep 1, 2021
f9604a5
czech: Remove -ům ending in do_case
ojwb Nov 2, 2023
1974720
Add initial version of CzechStemmerLight.java
ojwb Sep 4, 2024
73b289b
CzechStemmerLight: Fix length check for ště/šti/ští removal
ojwb Sep 5, 2024
fdf3084
Change č suffix check to če
ojwb Sep 5, 2024
d8aa06d
czech: Change -čté/-šté to -čtí/-ští
ojwb Sep 5, 2024
5809772
CzechStemmerLight: Remove one char for -es/-ém/-ím
ojwb Sep 5, 2024
9a43a32
Fix handling of possessive removal
ojwb Sep 6, 2024
87d17fa
Adjust palatalise to work like the Java version
ojwb Sep 9, 2024
7144cbe
czech: Comment out unused R1 routine for now
ojwb Sep 10, 2024
a62ed0a
czech: Don't remove -os suffix
ojwb Oct 7, 2024
7534b58
czech: Remove more suffixes
ojwb Oct 8, 2024
8a42618
czech: Remove -'{i'}mu'
ojwb Oct 8, 2024
1179ace
czech: Use a better definition of R1
ojwb Oct 8, 2024
481985e
czech: Optimise R1 check
ojwb Oct 8, 2024
634f9d1
Improve comments
ojwb Oct 8, 2024
16b06c6
czech: Use R1 instead of RV
ojwb Oct 9, 2024
a1f4451
czech: Merge two identical routines
ojwb Oct 10, 2024
6c69cf7
Update syllabic consonant comment
ojwb Jan 28, 2025
d80707e
Remove -ěm
ojwb Oct 22, 2025
e82bc6a
Palatalise -í* the same as =i
ojwb Oct 22, 2025
47494ee
czech: Adjust stringdefs to match other uses
ojwb Oct 22, 2025
02298e1
czech: Give up early for < 3 character inputs
ojwb Oct 23, 2025
55ddf27
czech: Handle -ť and -ťmi
ojwb Oct 30, 2025
8467650
czech: Handle -ec
ojwb Oct 30, 2025
96b828a
Improve palatalise of -št
ojwb Oct 30, 2025
993c17d
czech: Handle -ek
ojwb Oct 31, 2025
85da1d2
czech: Handle -eb
ojwb Oct 31, 2025
c74ea7c
czech: Handle -et
ojwb Oct 31, 2025
026c000
czech: Handle -ev
ojwb Oct 31, 2025
8ca4338
czech: Handle -eň
ojwb Nov 2, 2025
828db4d
Use stringdefs
ojwb Nov 2, 2025
4d4fb52
czech: Handle -něk
ojwb Nov 6, 2025
1fd5ab0
Don't convert -č to -k after suffix removal
ojwb Nov 7, 2025
50d974e
Don't convert -ž to -h after suffix removal
ojwb Nov 7, 2025
6f47269
Don't convert -z to -h after suffix removal
ojwb Nov 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
297 changes: 297 additions & 0 deletions CzechStemmerLight.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * Light Czech stemmer: removes case endings from nouns and adjectives,
 * removes possessive adjective endings from names, and takes care of
 * palatalisation.
 *
 * <p>Instances keep no per-word state between calls to {@link #stem(String)}.
 *
 * @author Dolamic Ljiljana University of Neuchatel
 */
public class CzechStemmerLight {

    /**
     * Default constructor.
     */
    public CzechStemmerLight() {}

    /**
     * Stems a single word.
     *
     * <p>The word is lower-cased first, then case endings are removed, then
     * possessive endings ({@code -ov}, {@code -ův}, {@code -in}).
     *
     * @param input the word to stem; must not be {@code null}
     * @return the lower-cased, stemmed word
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public String stem(String input) {
        // Locale.ROOT keeps the case mapping independent of the platform
        // default locale (e.g. under a Turkish locale "I" would otherwise
        // become a dotless i and no longer match the suffix rules below).
        StringBuilder word = new StringBuilder(input.toLowerCase(java.util.Locale.ROOT));

        // Removes case endings from nouns and adjectives.
        removeCase(word);

        // Removes possessive endings from names: -ov-, -ův- and -in-.
        removePossessives(word);

        return word.toString();
    }

    /**
     * Reports whether {@code word} ends with at least one of {@code suffixes}.
     * A suffix longer than the word never matches.
     */
    private static boolean endsWithAny(StringBuilder word, String... suffixes) {
        for (String suffix : suffixes) {
            int start = word.length() - suffix.length();
            // Searching from `start` can only succeed at `start` itself,
            // because the suffix is exactly as long as the remaining text.
            if (start >= 0 && word.indexOf(suffix, start) == start) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes the last {@code count} characters of {@code word}.
     */
    private static void chop(StringBuilder word, int count) {
        word.setLength(word.length() - count);
    }

    /**
     * Undoes palatalisation at the end of {@code word}, or, when no
     * palatalised ending is present, drops the final character.
     *
     * <p>Callers in this class only invoke it on words that still have at
     * least four characters.
     */
    private static void palatalise(StringBuilder word) {
        int len = word.length();

        if (endsWithAny(word, "ci", "ce", "\u010di", "\u010de")) {                     // -ci -ce -či -če
            word.replace(len - 2, len, "k");
        } else if (endsWithAny(word, "zi", "ze", "\u017ei", "\u017ee")) {              // -zi -ze -ži -že
            word.replace(len - 2, len, "h");
        } else if (endsWithAny(word, "\u010dt\u011b", "\u010dti", "\u010dt\u00ed")) {  // -čtě -čti -čtí
            word.replace(len - 3, len, "ck");
        } else if (endsWithAny(word, "\u0161t\u011b", "\u0161ti", "\u0161t\u00ed")) {  // -ště -šti -ští
            word.replace(len - 3, len, "sk");
        } else {
            chop(word, 1);
        }
    }

    /**
     * Removes a possessive ending from words longer than five characters:
     * {@code -ov} and {@code -ův} are dropped; {@code -in} loses its
     * {@code n} and the remainder is passed to {@link #palatalise}.
     */
    private static void removePossessives(StringBuilder word) {
        if (word.length() <= 5) {
            return;
        }
        if (endsWithAny(word, "ov", "\u016fv")) {    // -ov -ův
            chop(word, 2);
        } else if (endsWithAny(word, "in")) {
            // Keep the "i" so that palatalise() sees e.g. "-či" or "-ci".
            chop(word, 1);
            palatalise(word);
        }
    }

    /**
     * Removes at most one case ending from {@code word}.
     *
     * <p>Longer endings are tried first, and each group of endings is only
     * tried when the word is long enough (length measured before any
     * removal). Endings whose vowel can trigger palatalisation keep that
     * vowel and are handed to {@link #palatalise}; the others are removed
     * outright.
     */
    private static void removeCase(StringBuilder word) {
        int len = word.length();

        if (len > 7 && endsWithAny(word, "atech")) {
            chop(word, 5);
            return;
        }

        if (len > 6) {
            if (endsWithAny(word, "\u011btem")) {    // -ětem
                chop(word, 3);
                palatalise(word);
                return;
            }
            if (endsWithAny(word, "at\u016fm")) {    // -atům
                chop(word, 4);
                return;
            }
        }

        if (len > 5) {
            if (endsWithAny(word,
                    "ech", "ich", "\u00edch",        // -ech -ich -ích
                    "\u00e9ho", "\u011bmi", "emi",   // -ého -ěmi -emi
                    "\u00e9mu", "\u011bte",          // -ému -ěte
                    "\u011bti", "iho",               // -ěti -iho
                    "\u00edho", "\u00edmi", "imu")) { // -ího -ími -imu
                chop(word, 2);
                palatalise(word);
                return;
            }
            if (endsWithAny(word,
                    "\u00e1ch", "ata", "aty",        // -ách -ata -aty
                    "\u00fdch", "ama", "ami",        // -ých -ama -ami
                    "ov\u00e9", "ovi", "\u00fdmi")) { // -ové -ovi -ými
                chop(word, 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWithAny(word, "em", "es", "\u00e9m", "\u00edm")) {   // -em -es -ém -ím
                chop(word, 1);
                palatalise(word);
                return;
            }
            if (endsWithAny(word,
                    "\u016fm", "at", "\u00e1m",      // -ům -at -ám
                    "os", "us", "\u00fdm",           // -os -us -ým
                    "mi", "ou")) {
                chop(word, 2);
                return;
            }
        }

        if (len > 3) {
            if (endsWithAny(word, "e", "i", "\u00ed", "\u011b")) {       // -e -i -í -ě
                palatalise(word);
                return;
            }
            if (endsWithAny(word,
                    "u", "y", "\u016f",              // -u -y -ů
                    "a", "o", "\u00e1",              // -a -o -á
                    "\u00e9", "\u00fd")) {           // -é -ý
                chop(word, 1);
            }
        }
    }

    /**
     * Prints the command-line synopsis to standard error.
     */
    private static void usage()
    {
        System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
    }

    /**
     * Reads whitespace-separated tokens from {@code reader}, stems each one
     * and writes one stem per line to {@code output}.
     *
     * <p>Every whitespace character terminates the current token, so runs of
     * whitespace produce empty output lines. A final token that is not
     * followed by whitespace is emitted as well.
     */
    private static void stemTokens(Reader reader, Writer output, CzechStemmerLight stemmer)
            throws java.io.IOException {
        StringBuilder token = new StringBuilder();
        int character;
        while ((character = reader.read()) != -1) {
            char ch = (char) character;
            if (Character.isWhitespace(ch)) {
                output.write(stemmer.stem(token.toString()));
                output.write('\n');
                token.setLength(0);
            } else {
                token.append(ch < 127 ? Character.toLowerCase(ch) : ch);
            }
        }
        // Input that does not end in whitespace still carries a pending word.
        if (token.length() > 0) {
            output.write(stemmer.stem(token.toString()));
            output.write('\n');
        }
    }

    /**
     * Command-line driver: stems every whitespace-separated token of the
     * input (UTF-8) and writes one stem per line to the output (UTF-8).
     *
     * <p>{@code args[0]} is required but its value is not used
     * (NOTE(review): presumably kept so the command line matches the
     * {@code TestApp} synopsis printed by {@link #usage()} - confirm).
     * An optional input file follows; {@code -o <file>} selects an output
     * file. Standard input / standard output are used otherwise.
     *
     * @param args command-line arguments as described above
     * @throws Throwable if reading or writing fails
     */
    public static void main(String [] args) throws Throwable {
        if (args.length < 1) {
            usage();
            return;
        }

        int arg = 1;

        String inputPath = null;
        if (args.length > arg && !args[arg].equals("-o")) {
            inputPath = args[arg++];
        }

        String outputPath = null;
        if (args.length > arg) {
            if (args.length != arg + 2 || !args[arg].equals("-o")) {
                usage();
                return;
            }
            outputPath = args[arg + 1];
        }

        // Files are opened only after the arguments have been validated, and
        // only streams opened here are closed; System.in/System.out stay open.
        InputStream instream = inputPath == null ? System.in : new FileInputStream(inputPath);
        try {
            OutputStream outstream = outputPath == null ? System.out : new FileOutputStream(outputPath);
            try {
                Reader reader = new BufferedReader(
                        new InputStreamReader(instream, StandardCharsets.UTF_8));
                Writer output = new BufferedWriter(
                        new OutputStreamWriter(outstream, StandardCharsets.UTF_8));

                stemTokens(reader, output, new CzechStemmerLight());
                output.flush();
            } finally {
                if (outputPath != null) {
                    outstream.close();
                }
            }
        } finally {
            if (inputPath != null) {
                instream.close();
            }
        }
    }

}
Loading