Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
requirements-file: ["requirements.txt"]
runs-on: ${{ matrix.os }}
steps:
Expand Down
45 changes: 24 additions & 21 deletions bin/opusfilter
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,27 @@ from opusfilter.util import yaml
logging.basicConfig(level=logging.INFO)
# Raise the Moses tokenizer's logger to WARNING so its INFO messages are suppressed.
logging.getLogger('mosestokenizer.tokenizer.MosesTokenizer').setLevel(logging.WARNING)

# Command-line interface: run the steps of a YAML-configured OpusFilter pipeline.
parser = argparse.ArgumentParser(prog='opusfilter',
                                 description='Filter OPUS bitexts')

parser.add_argument('config', metavar='CONFIG', help='YAML configuration file')
parser.add_argument('--overwrite', '-o', help='overwrite existing output files', action='store_true')
parser.add_argument('--last', type=int, default=None, help='Last step to run')
parser.add_argument('--single', type=int, default=None, help='Run only the nth step')
parser.add_argument('--n-jobs', type=int, default=None,
                    help='Number of parallel jobs when running score, filter and preprocess.')

args = parser.parse_args()

# NOTE(review): the handle returned by open() is never closed explicitly;
# a `with` statement would be safer.
configuration = yaml.load(open(args.config))
if args.n_jobs is not None:
    # The command-line value overrides the configuration's default job count.
    # NOTE(review): raises KeyError if the configuration has no 'common' section — confirm.
    configuration['common']['default_n_jobs'] = args.n_jobs

of = OpusFilter(configuration)
if args.single is None:
    # Execute all steps, optionally stopping after step --last.
    of.execute_steps(overwrite=args.overwrite, last=args.last)
else:
    # Execute only the step selected by --single.
    of.execute_step(args.single, overwrite=args.overwrite)

if __name__ == '__main__':

    # Command-line interface: run the steps of a YAML-configured OpusFilter pipeline.
    parser = argparse.ArgumentParser(prog='opusfilter',
                                     description='Filter OPUS bitexts')

    parser.add_argument('config', metavar='CONFIG', help='YAML configuration file')
    parser.add_argument('--overwrite', '-o', help='overwrite existing output files', action='store_true')
    parser.add_argument('--last', type=int, default=None, help='Last step to run')
    parser.add_argument('--single', type=int, default=None, help='Run only the nth step')
    parser.add_argument('--n-jobs', type=int, default=None,
                        help='Number of parallel jobs when running score, filter and preprocess.')

    args = parser.parse_args()

    # Use a context manager so the configuration file is closed promptly
    # instead of relying on garbage collection.
    with open(args.config) as config_file:
        configuration = yaml.load(config_file)
    if args.n_jobs is not None:
        # The command-line value overrides the configuration's default job count.
        # setdefault avoids a KeyError when the configuration has no 'common' section.
        configuration.setdefault('common', {})['default_n_jobs'] = args.n_jobs

    of = OpusFilter(configuration)
    if args.single is None:
        # Execute all steps, optionally stopping after step --last.
        of.execute_steps(overwrite=args.overwrite, last=args.last)
    else:
        # Execute only the step selected by --single.
        of.execute_step(args.single, overwrite=args.overwrite)
128 changes: 65 additions & 63 deletions bin/opusfilter-autogen
Original file line number Diff line number Diff line change
Expand Up @@ -18,74 +18,76 @@ except OSError:

logger = logging.getLogger(__name__)

logging.basicConfig(level=logging.INFO)
logging.getLogger('mosestokenizer.tokenizer.MosesTokenizer').setLevel(logging.WARNING)
if __name__ == '__main__':

parser = argparse.ArgumentParser(
prog='opusfilter-autogen',
description='Generate initial configuration based on parallel text data')
logging.basicConfig(level=logging.INFO)
logging.getLogger('mosestokenizer.tokenizer.MosesTokenizer').setLevel(logging.WARNING)

parser.add_argument('--files', required=True, nargs='+', metavar='TEXTFILE', help='parallel text input file(s)')
parser.add_argument('--langs', nargs='+', metavar='LANGCODE',
help='Language codes corresponding to the input files. If omitted, LanguageIDFilters will not be used.')
parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
'Alphabetic scripts (e.g. Latin) corresponding to the input files. '
'If omitted, CharacterScoreFilter will not be used.'))
parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
help='Method for selecting filter thresholds (default: %(default)s)')
parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
help='Max number of sentence pairs used for data-based methods (default %(default)s)')
parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
'as noisy (default %(default)s)'))
parser.add_argument('--work-dir', default='work',
help='Location of the source and target files for the generated configuration (default %(default)s)')
parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
parser.add_argument('--plot', metavar='PATH', default=None, type=str,
help=('Create histograms of feature data distributions and a scatter plot of the clustering; '
'give path to plot the PDF files to, or "-" for interactive plots; only for the clustering method'))
parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit')
parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'),
help=('Instead of using default filters, add a filter of CLASS with JSON parameters object '
'("{}" for default parameters). The class name may be followed by a dot and a unique '
'filter identifier in order to allow multiple filters of the same class. Example: '
'--add-filter LanguageIDFilter.cld2 \'{"id_method": "cld2"}\''))
parser.add_argument('--overwrite', action='store_true',
help='Overwrite existing intermediate files')
parser.add_argument('-o', '--output', type=argparse.FileType('w'),
default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)')
args = parser.parse_args()
parser = argparse.ArgumentParser(
prog='opusfilter-autogen',
description='Generate initial configuration based on parallel text data')

filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if args.add_filter else None
parser.add_argument('--files', required=True, nargs='+', metavar='TEXTFILE', help='parallel text input file(s)')
parser.add_argument('--langs', nargs='+', metavar='LANGCODE',
help='Language codes corresponding to the input files. If omitted, LanguageIDFilters will not be used.')
parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
'Alphabetic scripts (e.g. Latin) corresponding to the input files. '
'If omitted, CharacterScoreFilter will not be used.'))
parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
help='Method for selecting filter thresholds (default: %(default)s)')
parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
help='Max number of sentence pairs used for data-based methods (default %(default)s)')
parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
'as noisy (default %(default)s)'))
parser.add_argument('--work-dir', default='work',
help='Location of the source and target files for the generated configuration (default %(default)s)')
parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
parser.add_argument('--plot', metavar='PATH', default=None, type=str,
help=('Create histograms of feature data distributions and a scatter plot of the clustering; '
'give path to plot the PDF files to, or "-" for interactive plots; only for the clustering method'))
parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit')
parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'),
help=('Instead of using default filters, add a filter of CLASS with JSON parameters object '
'("{}" for default parameters). The class name may be followed by a dot and a unique '
'filter identifier in order to allow multiple filters of the same class. Example: '
'--add-filter LanguageIDFilter.cld2 \'{"id_method": "cld2"}\''))
parser.add_argument('--overwrite', action='store_true',
help='Overwrite existing intermediate files')
parser.add_argument('-o', '--output', type=argparse.FileType('w'),
default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)')
args = parser.parse_args()

if args.method == 'clustering':
filtergen = ClusterFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
sample_size=args.sample_size, k=args.clusters, inter_dir=args.inter_dir, overwrite=args.overwrite)
elif args.method == 'percentiles':
filtergen = PercentileFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
excluded_percentile=args.noisy_percentile, sample_size=args.sample_size,
inter_dir=args.inter_dir, overwrite=args.overwrite)
else:
filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts, filters=filters)
filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if args.add_filter else None

if args.list_defaults:
yaml.dump(filtergen.DEFAULT_FILTERS, args.output)
sys.exit(0)
if args.method == 'clustering':
filtergen = ClusterFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
sample_size=args.sample_size, k=args.clusters, inter_dir=args.inter_dir, overwrite=args.overwrite)
elif args.method == 'percentiles':
filtergen = PercentileFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
excluded_percentile=args.noisy_percentile, sample_size=args.sample_size,
inter_dir=args.inter_dir, overwrite=args.overwrite)
else:
filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts, filters=filters)

filters = filtergen.set_filter_thresholds()
if args.list_defaults:
yaml.dump(filtergen.DEFAULT_FILTERS, args.output)
sys.exit(0)

if args.method == 'clustering' and args.plot is not None:
if args.plot == '-':
filtergen.scoredata.plot(plt)
plt.show()
else:
filtergen.scoredata.plot(plt, path=args.plot)
filters = filtergen.set_filter_thresholds()

if args.method == 'clustering' and args.plot is not None:
if args.plot == '-':
filtergen.scoredata.plot(plt)
plt.show()
else:
filtergen.scoredata.plot(plt, path=args.plot)

generator = ConfigurationGenerator(
files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir)
generator.add_filter(filtergen.filters)
yaml.dump(generator.get_config(), args.output)
generator = ConfigurationGenerator(
files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir)
generator.add_filter(filtergen.filters)
yaml.dump(generator.get_config(), args.output)
94 changes: 48 additions & 46 deletions bin/opusfilter-cmd
Original file line number Diff line number Diff line change
Expand Up @@ -38,49 +38,51 @@ def update_parameters(parameters, name, values):
parameters[name].extend(values)


# Use to prevent warning from missing directory: OpusFilter is instantiated
# below only to enumerate its step functions.
tmpconfig = {'common': {'output_directory': '/tmp'}}

logging.basicConfig(level=logging.INFO)
# Raise the Moses tokenizer's logger to WARNING so its INFO messages are suppressed.
logging.getLogger('mosestokenizer.tokenizer.MosesTokenizer').setLevel(logging.WARNING)

# Command-line interface: build a one-step configuration and run a single
# OpusFilter function with parameters taken from the command line.
parser = argparse.ArgumentParser(
    prog='opusfilter-cmd', description='Run single opusfilter function', allow_abbrev=False)

parser.add_argument('function', choices=OpusFilter(tmpconfig).step_functions, help='OpusFilter function')
parser.add_argument('--overwrite', '-o', help='overwrite existing output files', action='store_true')
parser.add_argument('--outputdir', '-d', default='.', help='output directory')
parser.add_argument('--parameters', type=str, default=None, help='load parameters as a JSON object (e.g. \'{"inputs": ["all.gz"], "outputs": ["filtered.gz"]}\')')

# Options unknown to the parser are collected into `remaining` and interpreted
# below as parameters for the selected function.
args, remaining = parser.parse_known_args()

if args.parameters is None:
    parameters = {}
else:
    parameters = json.loads(args.parameters)

# Fold the leftover arguments (--name value [value ...]) into `parameters`:
# dashes in option names become underscores; each value goes through
# json_value (defined earlier in this script — presumably JSON-style literal
# parsing; confirm against its definition).
temp = copy.copy(remaining)
name = None
values = []
while temp:
    new = temp.pop(0)
    if new.startswith('--'):
        # A new option begins; store the values collected for the previous one.
        if name is not None:
            update_parameters(parameters, name, values)
        name = new[2:].replace('-', '_')
        values = []
        continue
    if name is None:
        # A positional value appeared before any --option name.
        raise ValueError("Could not parse remaining arguments: %s" % remaining)
    values.append(json_value(new))
# Flush the final option.
# NOTE(review): if `remaining` is empty, this calls update_parameters with
# name=None — confirm update_parameters tolerates that.
update_parameters(parameters, name, values)

# A minimal single-step configuration equivalent to what a YAML file would provide.
configuration = {
    'common': {'output_directory': args.outputdir},
    'steps': [{'type': args.function, 'parameters': parameters}]
}

logger.info("Created configuration:\n\n%s", yaml_dumps(configuration))

of = OpusFilter(configuration)
of.execute_steps(overwrite=args.overwrite)
if __name__ == '__main__':

    # Use to prevent warning from missing directory: OpusFilter is instantiated
    # below only to enumerate its step functions.
    tmpconfig = {'common': {'output_directory': '/tmp'}}

    logging.basicConfig(level=logging.INFO)
    # Raise the Moses tokenizer's logger to WARNING so its INFO messages are suppressed.
    logging.getLogger('mosestokenizer.tokenizer.MosesTokenizer').setLevel(logging.WARNING)

    # Command-line interface: build a one-step configuration and run a single
    # OpusFilter function with parameters taken from the command line.
    parser = argparse.ArgumentParser(
        prog='opusfilter-cmd', description='Run single opusfilter function', allow_abbrev=False)

    parser.add_argument('function', choices=OpusFilter(tmpconfig).step_functions, help='OpusFilter function')
    parser.add_argument('--overwrite', '-o', help='overwrite existing output files', action='store_true')
    parser.add_argument('--outputdir', '-d', default='.', help='output directory')
    parser.add_argument('--parameters', type=str, default=None, help='load parameters as a JSON object (e.g. \'{"inputs": ["all.gz"], "outputs": ["filtered.gz"]}\')')

    # Options unknown to the parser are collected into `remaining` and
    # interpreted below as parameters for the selected function.
    args, remaining = parser.parse_known_args()

    if args.parameters is None:
        parameters = {}
    else:
        parameters = json.loads(args.parameters)

    # Fold the leftover arguments (--name value [value ...]) into `parameters`:
    # dashes in option names become underscores; each value goes through
    # json_value (defined earlier in this script — presumably JSON-style
    # literal parsing; confirm against its definition).
    temp = copy.copy(remaining)
    name = None
    values = []
    while temp:
        new = temp.pop(0)
        if new.startswith('--'):
            # A new option begins; store the values collected for the previous one.
            if name is not None:
                update_parameters(parameters, name, values)
            name = new[2:].replace('-', '_')
            values = []
            continue
        if name is None:
            # A positional value appeared before any --option name.
            raise ValueError("Could not parse remaining arguments: %s" % remaining)
        values.append(json_value(new))
    # Flush the final option.
    # NOTE(review): if `remaining` is empty, this calls update_parameters with
    # name=None — confirm update_parameters tolerates that.
    update_parameters(parameters, name, values)

    # A minimal single-step configuration equivalent to what a YAML file would provide.
    configuration = {
        'common': {'output_directory': args.outputdir},
        'steps': [{'type': args.function, 'parameters': parameters}]
    }

    logger.info("Created configuration:\n\n%s", yaml_dumps(configuration))

    of = OpusFilter(configuration)
    of.execute_steps(overwrite=args.overwrite)
Loading
Loading