diff --git a/.gitmodules b/.gitmodules index b14568b2..e69de29b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,10 +0,0 @@ -[submodule "thrax"] - path = thrax - url = https://github.com/joshua-decoder/thrax.git -[submodule "berkeleylm"] - path = ext/berkeleylm - url = https://github.com/joshua-decoder/berkeleylm.git -[submodule "ext/kenlm"] - path = ext/kenlm - url = https://github.com/kpu/kenlm.git - branch = 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/.project b/.project deleted file mode 100644 index 7b6ed8ee..00000000 --- a/.project +++ /dev/null @@ -1,18 +0,0 @@ - - - joshua - - - - - - - org.eclipse.jdt.core.javabuilder - - - - - - org.eclipse.jdt.core.javanature - - \ No newline at end of file diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs deleted file mode 100644 index 578dc360..00000000 --- a/.settings/org.eclipse.core.resources.prefs +++ /dev/null @@ -1,3 +0,0 @@ -#Fri Sep 02 17:42:51 EDT 2011 -eclipse.preferences.version=1 -encoding/=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs deleted file mode 100644 index 084c1fdd..00000000 --- a/.settings/org.eclipse.jdt.core.prefs +++ /dev/null @@ -1,285 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.formatter.align_type_members_on_columns=false -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 -org.eclipse.jdt.core.formatter.alignment_for_assignment=0 -org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16 -org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 -org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 -org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0 -org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 -org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 -org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 -org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 -org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 -org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch=16 -org.eclipse.jdt.core.formatter.blank_lines_after_imports=1 -org.eclipse.jdt.core.formatter.blank_lines_after_package=1 -org.eclipse.jdt.core.formatter.blank_lines_before_field=0 -org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration=0 -org.eclipse.jdt.core.formatter.blank_lines_before_imports=1 
-org.eclipse.jdt.core.formatter.blank_lines_before_member_type=1 -org.eclipse.jdt.core.formatter.blank_lines_before_method=1 -org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk=1 -org.eclipse.jdt.core.formatter.blank_lines_before_package=0 -org.eclipse.jdt.core.formatter.blank_lines_between_import_groups=1 -org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations=1 -org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_array_initializer=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_block=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_block_in_case=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_enum_constant=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_lambda_body=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_method_declaration=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_switch=end_of_line -org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=end_of_line -org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false -org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false -org.eclipse.jdt.core.formatter.comment.format_block_comments=true -org.eclipse.jdt.core.formatter.comment.format_header=true -org.eclipse.jdt.core.formatter.comment.format_html=true -org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true -org.eclipse.jdt.core.formatter.comment.format_line_comments=true -org.eclipse.jdt.core.formatter.comment.format_source_code=true -org.eclipse.jdt.core.formatter.comment.indent_parameter_description=true -org.eclipse.jdt.core.formatter.comment.indent_root_tags=true -org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags=insert -org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter=do not insert -org.eclipse.jdt.core.formatter.comment.line_length=100 -org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries=true -org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries=true -org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=false -org.eclipse.jdt.core.formatter.compact_else_if=true -org.eclipse.jdt.core.formatter.continuation_indentation=2 -org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer=2 -org.eclipse.jdt.core.formatter.disabling_tag=@formatter\:off -org.eclipse.jdt.core.formatter.enabling_tag=@formatter\:on -org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line=false -org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column=true -org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header=true -org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header=true -org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header=true -org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header=true -org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases=true -org.eclipse.jdt.core.formatter.indent_empty_lines=false -org.eclipse.jdt.core.formatter.indent_statements_compare_to_block=true 
-org.eclipse.jdt.core.formatter.indent_statements_compare_to_body=true -org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases=true -org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch=false -org.eclipse.jdt.core.formatter.indentation.size=2 -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field=insert -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable=insert -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method=insert -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package=insert -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type=insert -org.eclipse.jdt.core.formatter.insert_new_line_after_label=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_after_type_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body=insert -org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter=insert -org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator=insert -org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_binary_operator=insert -org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments=insert -org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters=insert -org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block=insert -org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast=insert -org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert=insert -org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case=insert -org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional=insert -org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for=insert -org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation=insert 
-org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments=insert -org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters=insert -org.eclipse.jdt.core.formatter.insert_space_after_ellipsis=insert -org.eclipse.jdt.core.formatter.insert_space_after_lambda_arrow=insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try=do not insert 
-org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional=insert -org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard=do not insert -org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for=insert -org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources=insert -org.eclipse.jdt.core.formatter.insert_space_after_unary_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter=insert -org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator=insert -org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_binary_operator=insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert=insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional=insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for=insert -org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression=do not insert 
-org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_ellipsis=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_lambda_arrow=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference=do not insert 
-org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try=insert -org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while=insert -org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return=insert -org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw=insert -org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional=insert -org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_semicolon=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources=do not insert -org.eclipse.jdt.core.formatter.insert_space_before_unary_operator=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert -org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert -org.eclipse.jdt.core.formatter.join_lines_in_comments=true -org.eclipse.jdt.core.formatter.join_wrapped_lines=true -org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false -org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false -org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false -org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false -org.eclipse.jdt.core.formatter.lineSplit=100 -org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false 
-org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false
-org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0
-org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve=1
-org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line=true
-org.eclipse.jdt.core.formatter.tabulation.char=space
-org.eclipse.jdt.core.formatter.tabulation.size=2
-org.eclipse.jdt.core.formatter.use_on_off_tags=true
-org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations=false
-org.eclipse.jdt.core.formatter.wrap_before_binary_operator=true
-org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch=true
-org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested=true
diff --git a/.settings/org.eclipse.jdt.ui.prefs b/.settings/org.eclipse.jdt.ui.prefs
deleted file mode 100644
index db2ff42c..00000000
--- a/.settings/org.eclipse.jdt.ui.prefs
+++ /dev/null
@@ -1,4 +0,0 @@
-#Fri Oct 12 07:45:18 EDT 2012
-eclipse.preferences.version=1
-formatter_profile=_Joshua
-formatter_settings_version=12
diff --git a/README.md b/README.md
index 343f1835..de8edf20 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,18 @@ The basic method for invoking the decoder looks like this:
 Some example usage scenarios and scripts can be found in the
 [examples/](https://github.com/apache/incubator-joshua/tree/master/examples)
 directory.
+
+## Maven Build
+
+### Create executable jar
+
+    mvn clean compile assembly:single
+
+### Run the jar
+
+    java -jar target/joshua-6.0.6-SNAPSHOT-jar-with-dependencies.jar
+
+
 ## Working with "language packs"
 
 Joshua includes a number of "language packs", which are pre-built models that
diff --git a/bin/bleu b/bin/bleu
index 8778e5b6..087164b1 100755
--- a/bin/bleu
+++ b/bin/bleu
@@ -1,5 +1,20 @@
 #!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 
 if [[ -z $2 ]]; then
     echo "Usage: bleu output reference"
     exit 1
diff --git a/bin/extract-1best b/bin/extract-1best
index c84dec1e..22bd827d 100755
--- a/bin/extract-1best
+++ b/bin/extract-1best
@@ -1,3 +1,18 @@
 #!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $1 - $2 diff --git a/bin/joshua-decoder b/bin/joshua-decoder index cdb2cf49..c752d03b 100755 --- a/bin/joshua-decoder +++ b/bin/joshua-decoder @@ -1,4 +1,20 @@ #!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # Joshua decoder invocation script. # diff --git a/bin/meteor b/bin/meteor index 5f98a26c..6c9edf0b 100755 --- a/bin/meteor +++ b/bin/meteor @@ -1,5 +1,20 @@ #!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + if [[ -z $3 ]]; then echo "Usage: meteor output reference lang" exit 1 diff --git a/Dockerfile b/docker/Dockerfile similarity index 100% rename from Dockerfile rename to docker/Dockerfile diff --git a/examples/docker/ar-en-phrase/Dockerfile b/docker/ar-en-phrase/Dockerfile similarity index 100% rename from examples/docker/ar-en-phrase/Dockerfile rename to docker/ar-en-phrase/Dockerfile diff --git a/examples/docker/zh-en-hiero/Dockerfile b/docker/zh-en-hiero/Dockerfile similarity index 100% rename from examples/docker/zh-en-hiero/Dockerfile rename to docker/zh-en-hiero/Dockerfile diff --git a/examples/README.md b/examples/README.md index 6001e234..c2f14ee3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,4 +1,4 @@ -# Examples +# Joshua Examples The examples in this directory demonstrate how to exercise different Joshua features. If you have any comments or questions please submit @@ -10,4 +10,39 @@ Bugs or source code issues should be logged in our The decoding examples and model training examples in the subdirectories of this directory assume you have downloaded the Fisher Spanish--English dataset, which contains speech-recognizer output paired with English translations. This data -can be downloaded by running the [download.sh](https://github.com/apache/incubator-joshua/blob/master/examples/download.sh) script. 
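The `bin/extract-1best` script above is a thin wrapper around `joshua.util.ExtractTopCand`, which pulls the highest-scoring candidate for each sentence out of the decoder's n-best output. For reference, a minimal stand-alone sketch of that behavior, assuming the usual ` ||| `-delimited n-best line shape (`id ||| hypothesis ||| feature scores ||| total score`) with candidates emitted best-first per sentence; the class name `ExtractOneBest` is invented for illustration and is not part of Joshua:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;
    import java.util.HashSet;
    import java.util.Set;

    public class ExtractOneBest {
      public static void main(String[] args) throws Exception {
        BufferedReader in = new BufferedReader(
            new InputStreamReader(System.in, StandardCharsets.UTF_8));
        Set<String> seen = new HashSet<>();
        String line;
        while ((line = in.readLine()) != null) {
          // Assumed line shape: id ||| hypothesis ||| feature scores ||| total
          String[] fields = line.split(" \\|\\|\\| ");
          if (fields.length < 2)
            continue; // not an n-best entry; skip it
          // First hypothesis seen for an id is its best one (best-first order)
          if (seen.add(fields[0].trim()))
            System.out.println(fields[1].trim());
        }
      }
    }

Run it as, e.g., `javac ExtractOneBest.java && java ExtractOneBest < nbest.out > 1best.out`.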
+can be downloaded by running the [download.sh](https://github.com/apache/incubator-joshua/blob/master/src/examples/resources/download.sh) script.
+
+## Building a Spanish --> English Translation Model using the Fisher Spanish CALLHOME corpus
+
+An example of how to build a model using the Fisher Spanish CALLHOME corpus
+
+A) Download the corpus:
+  1) mkdir $HOME/git
+  2) cd $HOME/git
+  3) curl -o fisher-callhome-corpus.zip https://codeload.github.com/joshua-decoder/fisher-callhome-corpus/legacy.zip/master
+  4) unzip fisher-callhome-corpus.zip
+  5) # Set environment variable SPANISH=$HOME/git/fisher-callhome-corpus
+  6) mv joshua-decoder-*/ fisher-callhome-corpus
+
+B) Download and install Joshua:
+  1) cd /directory/to/install/
+  2) git clone https://github.com/apache/incubator-joshua.git
+  3) cd incubator-joshua
+  4) # Set environment variable JAVA_HOME=/path/to/java # Try $(readlink -f /usr/bin/javac | sed "s:/bin/javac::")
+  5) # Set environment variable JOSHUA=/directory/to/install/joshua
+  6) mvn install
+
+C) Train the model:
+  1) mkdir -p $HOME/expts/joshua && cd $HOME/expts/joshua
+  2) $JOSHUA/bin/pipeline.pl \
+       --rundir 1 \
+       --readme "Baseline Hiero run" \
+       --source es \
+       --target en \
+       --lm-gen srilm \
+       --witten-bell \
+       --corpus $SPANISH/corpus/asr/callhome_train \
+       --corpus $SPANISH/corpus/asr/fisher_train \
+       --tune $SPANISH/corpus/asr/fisher_dev \
+       --test $SPANISH/corpus/asr/callhome_devtest \
+       --lm-order 3
\ No newline at end of file
diff --git a/examples/README.sp_to_en b/examples/README.sp_to_en
deleted file mode 100644
index 95e99bc9..00000000
--- a/examples/README.sp_to_en
+++ /dev/null
@@ -1,32 +0,0 @@
-An example of how to build a model using the Fisher Spanish CALLHOME corpus
-
-A) Download the corpus:
-  1) mkdir $HOME/git
-  2) cd $HOME/git
-  3) curl -o fisher-callhome-corpus.zip https://codeload.github.com/joshua-decoder/fisher-callhome-corpus/legacy.zip/master
-  4) unzip fisher-callhome-corpus.zip
-  5) # Set environment variable SPANISH=$HOME/git/fisher-callhome-corpus
-  5) mv joshua-decoder-*/ fisher-callhome-corpus
-
-B) Download and install Joshua:
-  1) cd /directory/to/install/
-  2) git clone https://github.com/joshua-decoder/joshua.git
-  3) cd joshua
-  4) # Set environment variable JAVA_HOME=/path/to/java # Try $(readlink -f /usr/bin/javac | sed "s:/bin/javac::")
-  5) # Set environment variable JOSHUA=/directory/to/install/joshua
-  6) ant devel
-
-C) Train the model:
-  1) mkdir -p $HOME/expts/joshua && cd $HOME/expts/joshua
-  2) $JOSHUA/bin/pipeline.pl \
-     --rundir 1 \
-     --readme "Baseline Hiero run" \
-     --source es \
-     --target en \
-     --lm-gen srilm \
-     --witten-bell \
-     --corpus $SPANISH/corpus/asr/callhome_train \
-     --corpus $SPANISH/corpus/asr/fisher_train \
-     --tune $SPANISH/corpus/asr/fisher_dev \
-     --test $SPANISH/corpus/asr/callhome_devtest \
-     --lm-order 3
diff --git a/ext/berkeleylm b/ext/berkeleylm
deleted file mode 160000
index c431057d..00000000
--- a/ext/berkeleylm
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c431057d7512d897146ebccdf0f446d387397702
diff --git a/ext/giza-pp/GIZA++-v2/ATables.cpp b/ext/giza-pp/GIZA++-v2/ATables.cpp
deleted file mode 100644
index 44011949..00000000
--- a/ext/giza-pp/GIZA++-v2/ATables.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "ATables.h" -#include "Globals.h" -#include "myassert.h" -#include "Parameter.h" - -GLOBAL_PARAMETER(bool,CompactADTable,"compactadtable","1: only 3-dimensional alignment table for IBM-2 and IBM-3",PARLEV_MODELS,1); -GLOBAL_PARAMETER(float,amodel_smooth_factor,"model23SmoothFactor","smoothing parameter for IBM-2/3 (interpolation with constant)",PARLEV_SMOOTH,0.0); - -template -void amodel::printTable(const char *filename) const - // print amodel to file with the name filename (it'll be created or overwritten - // format : for a table : - // aj j l m val - // where aj is source word pos, j target word pos, l source sentence length, - // m is target sentence length. - // -{ - //return; - if (is_distortion) - cout << "Dumping distortion table (d) to file:" << filename <<'\n'; - else - cout << "Dumping alignment table (a) to file:" << filename <<'\n'; - - ofstream of(filename); - double ssum=0.0; - for(WordIndex l=0; l < MaxSentLength; l++) - for(WordIndex m=0;mPROB_SMOOTH ) - { - of << i << ' ' << j << ' ' << L << ' ' << M << ' ' << x << '\n'; - sum+=x; - } - } - ssum+=sum; - } - else - for(WordIndex i=0;i<=L;i++) - { - double sum=0.0; - for(WordIndex j=1;j<=M;j++) - - { - VALTYPE x=getValue(j, i, L, M); - if( x>PROB_SMOOTH ) - { - of << j << ' ' << i << ' ' << L << ' ' << M << ' ' << x << '\n'; - sum+=x; - } - } - ssum+=sum; - } - } -} - -extern short NoEmptyWord; - -template -void amodel::readTable(const char *filename) -{ - /* This function reads the a table from a file. - Each line is of the format: aj j l m val - where aj is the source word position, j the target word position, - l the source sentence length, and m the target sentence length - - This function also works for a d table, where the positions - of aj and i are swapped. Both the a and d tables are 4 dimensional - hashes; this function will simply read in the four values and keep - them in that order when hashing the fifth value. - NAS, 7/11/99 - */ - ifstream inf(filename); - cout << "Reading a/d table from " << filename << "\n"; - if(!inf){ - cerr << "\nERROR: Cannot open " << filename<<"\n"; - return; - } - WordIndex w, x, l, m; - VALTYPE prob; - while(inf >> w >> x >> l >> m >> prob ) - // the NULL word is added to the length - // of the sentence in the tables, but discount it when you write the tables. - setValue(w, x, l, m, prob); -} - -template class amodel ; -//template class amodel ; diff --git a/ext/giza-pp/GIZA++-v2/ATables.h b/ext/giza-pp/GIZA++-v2/ATables.h deleted file mode 100644 index 70d5030d..00000000 --- a/ext/giza-pp/GIZA++-v2/ATables.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/* --------------------------------------------------------------------------* - * * - * Module :ATables * - * * - * Prototypes File: ATables.h * - * * - * Objective: Defines clases and methods for handling I/O for distortion & * - * alignment tables. * - *****************************************************************************/ - -#ifndef _atables_h -#define _atables_h 1 - -#include "defs.h" -#include -#include -#include -#include -#include -#include -#include "Vector.h" -#include -#include -#include "Array4.h" -#include "myassert.h" -#include "Globals.h" - -extern bool CompactADTable; -extern float amodel_smooth_factor; -extern short NoEmptyWord; - -/* ------------------- Class Defintions of amodel ---------------------------*/ -/* Class Name: amodel: - Objective: This defines the underlying data structure for distortiont prob. - and count tables. They are defined as a hash table. Each entry in the hash - table is the probability (d(j/l,m,i), where j is word target position, i is - source word position connected to it, m is target sentence length, and l is - source sentence length) or count collected for it. The probability and the - count are represented as log integer probability as - defined by the class LogProb . - - This class is used to represents a Tables (probabiliity) and d (distortion) - tables and also their corresponding count tables . 
- - *--------------------------------------------------------------------------*/ - -inline int Mabs(int a) -{ - if(a<0) - return -a; - else - return a; -} - -template -class amodel -{ - public: - Array4 a; - bool is_distortion ; - WordIndex MaxSentLength; - bool ignoreL, ignoreM; - VALTYPE get(WordIndex aj, WordIndex j, WordIndex l, WordIndex m)const - { - massert( (!is_distortion) || aj<=m );massert( (!is_distortion) || j<=l );massert( (!is_distortion) || aj!=0 ); - massert( is_distortion || aj<=l );massert( is_distortion || j<=m );massert( (is_distortion) || j!=0 ); - massert( l - void normalize(amodel& aTable)const - { - WordIndex i, j, l, m ; - COUNT total; - int nParam=0; - for(l=0;l& aj, LogProb val) -{ - hash_map, LogProb, hashmyalignment, equal_to_myalignment >::iterator i; - i = a.find(aj); - if(i != a.end() || val <= 0) - return false ; - a.insert(pair, LogProb>(aj, val)); - return true ; -} - - -LogProb alignmodel::getValue(Vector& align) const -{ - const LogProb zero = 0.0 ; - hash_map, LogProb, hashmyalignment, equal_to_myalignment >::const_iterator i; - i = a.find(align); - if(i == a.end()) - return zero; - else - return (*i).second; -} diff --git a/ext/giza-pp/GIZA++-v2/AlignTables.h b/ext/giza-pp/GIZA++-v2/AlignTables.h deleted file mode 100644 index 0daa2c30..00000000 --- a/ext/giza-pp/GIZA++-v2/AlignTables.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _aligntables_h -#define _aligntables_h 1 - -#include "defs.h" - - -#include - -#include -#include -#include -#include -#include -//#include -#include "Vector.h" -#include -#include -#include -#include "transpair_model1.h" - - -/* ----------------- Class Defintions for hashmyalignment -------------------- - Objective: This class is used to define a hash mapping function to map - an alignment (defined as a vector of integers) into a hash key - ----------------------------------------------------------------------------*/ - -class hashmyalignment : public unary_function< Vector, size_t > -{ -public: - size_t operator() (const Vector& key) const - // to define the mapping function. it takes an alignment (a vector of - // integers) and it returns an integer value (hash key). 
- { - WordIndex j ; - size_t s ; - size_t key_sum = 0 ; - // logmsg << "For alignment:" ; - for (j = 1 ; j < key.size() ; j++){ - // logmsg << " " << key[j] ; - key_sum += (size_t) (int) pow(double(key[j]), double((j % 6)+1)); - } - // logmsg << " , Key value was : " << key_sum; - s = key_sum % 1000000 ; - // logmsg << " h(k) = " << s << endl ; - return(s); - } -}; - -class equal_to_myalignment{ - // returns true if two alignments are the same (two vectors have same enties) -public: - bool operator()(const Vector t1, - const Vector t2) const - {WordIndex j ; - if (t1.size() != t2.size()) - return(false); - for (j = 1 ; j < t1.size() ; j++) - if (t1[j] != t2[j]) - return(false); - return(true); - } - -}; - -/* ---------------- End of Class Defnition for hashmyalignment --------------*/ - - -/* ------------------ Class Defintions for alignmodel ----------------------- - Class Name: alignmodel - Objective: Alignments neighborhhoods (collection of alignments) are stored in - a hash table (for easy lookup). Each alignment vector is mapped into a hash - key using the operator defined above. - *--------------------------------------------------------------------------*/ - -class alignmodel{ -private: - hash_map, LogProb, hashmyalignment, equal_to_myalignment > a; -private: - // void erase(Vector&); -public: - - // methods; - - inline hash_map, LogProb, hashmyalignment, equal_to_myalignment >::iterator begin(void){return a.begin();} // begining of hash - inline hash_map, LogProb, hashmyalignment, equal_to_myalignment >::iterator end(void){return a.end();} // end of hash - inline const hash_map, LogProb, hashmyalignment, equal_to_myalignment >& getHash() const {return a;}; // reference to hash table - bool insert(Vector&, LogProb val=0.0); // add a alignmnet - // void setValue(Vector&, LogProb val); // not needed - LogProb getValue(Vector&)const; // retrieve prob. of alignment - inline void clear(void){ a.clear();}; // clear hash table - // void printTable(const char* filename); - //inline void resize(WordIndex n) {a.resize(n);}; // resize table - -}; - -/* -------------- End of alignmode Class Definitions ------------------------*/ -#endif diff --git a/ext/giza-pp/GIZA++-v2/Array.h b/ext/giza-pp/GIZA++-v2/Array.h deleted file mode 100644 index eae58d43..00000000 --- a/ext/giza-pp/GIZA++-v2/Array.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef GIZA_ARRAY_H_DEFINED -#define GIZA_ARRAY_H_DEFINED -#include "Vector.h" -#define Array Vector -#endif diff --git a/ext/giza-pp/GIZA++-v2/Array2.h b/ext/giza-pp/GIZA++-v2/Array2.h deleted file mode 100644 index 546d63a4..00000000 --- a/ext/giza-pp/GIZA++-v2/Array2.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
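The `hashmyalignment` functor above buckets an alignment vector (target position j mapped to source position a_j) by summing a_j raised to the power ((j mod 6) + 1) over all positions and taking the result modulo 10^6; collisions are then resolved by the `equal_to_myalignment` comparator. A compact Java rendering of the same scheme (the class and method names here are illustrative, not part of GIZA++):

    import java.util.Arrays;
    import java.util.List;

    public class AlignmentHash {
      // Mirrors hashmyalignment::operator(): position 0 is unused, as in GIZA++
      static int hash(List<Integer> alignment) {
        long sum = 0;
        for (int j = 1; j < alignment.size(); j++)
          sum += (long) Math.pow(alignment.get(j), (j % 6) + 1);
        return (int) (sum % 1_000_000); // bucket index in [0, 1e6)
      }

      public static void main(String[] args) {
        // a_1..a_4: a four-word target sentence aligned to source positions
        System.out.println(hash(Arrays.asList(0, 1, 2, 2, 3)));
      }
    }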
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/*-- -Array2: Implementation of a twodimensional checked array allowing for -a specified underlieing one-dimensional data-structure. - -Franz Josef Och (30/07/99) ---*/ -#ifndef CLASS_Array2_DEFINED -#define CLASS_Array2_DEFINED - -#include "mystl.h" -#include -#include - -template > class Array2 -{ -private: - Y p; - // short h1, h2; - unsigned int h1, h2; -public: - Array2(unsigned int _h1, unsigned int _h2) - : p(_h1*_h2), h1(_h1), h2(_h2) {} - Array2(unsigned int _h1, unsigned int _h2, const T&_init) - : p(_h1*_h2, _init), h1(_h1), h2(_h2) {} - Array2() - : h1(0), h2(0) {} - inline T &operator()(unsigned int i, unsigned int j) - { assert(i&ar) - { - for(unsigned int i=0;i class Array4 -{ - private: - Array2< Array2* > A; - int M; - T init; - public: - Array4(int m,const T&_init) - : A(m,m,0),M(m),init(_init) {} - ~Array4() - { - for(int l=0;l(max(l+1,m+1),max(l+1,m+1),init); - } - return (*A(l,m))(i,j); - } - void clear() - { - for(int l=0;l&a=*A(l,m); - for(int i=0;i<=l;++i) - for(int j=0;j<=m;++j) - a(i,j)=0.0; - } - } -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/D4Tables.h b/ext/giza-pp/GIZA++-v2/D4Tables.h deleted file mode 100644 index e047bcc9..00000000 --- a/ext/giza-pp/GIZA++-v2/D4Tables.h +++ /dev/null @@ -1,460 +0,0 @@ -/* - -Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef _d4tables_h_define -#define _d4tables_h_define -#include -#include "WordClasses.h" -#include "Globals.h" -#include "myassert.h" - -extern float d4modelsmooth_factor; - -class m4_key -{ - public: - int deps; - int l; - int m; - int F; - int E; - int prevj; - int vacancies1,vacancies2; - m4_key(int _deps,int _l,int _m,int _F,int _E,int _prevj,int _v1,int _v2) - : deps(_deps),l(_l),m(_m),F(_F),E(_E),prevj(_prevj),vacancies1(_v1),vacancies2(_v2) {} - friend ostream&print1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf) - { - if(x.deps&DEP_MODEL_l)out << "l: " << x.l<<' '; - if(x.deps&DEP_MODEL_m)out << "m: " << x.m<<' '; - if(x.deps&DEP_MODEL_F)out << "F: " << wcf.classString(x.F)<< ' '; - if(x.deps&DEP_MODEL_E)out << "E: " << wce.classString(x.E)<< ' '; - // if(x.deps&DEP_MODEL_pj)out << "j-1: " << x.prevj<<' '; - if(x.vacancies1!=-1)out << "v1: " << x.vacancies1 << ' '; - if(x.vacancies2!=-1)out << "v2: " << x.vacancies2 << ' '; - return out << '\n'; - } - friend ostream&print1_m5(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf) - { - out << ((x.deps&DEP_MODEL_E)?wce.classString(x.E):string("0"))<< ' '; - out << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' '; - out << x.vacancies1 << ' '; - out << x.vacancies2 << ' '; - return out; - } - friend ostream&printb1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf) - { - if(x.deps&DEP_MODELb_l)out << "l: " << x.l<<' '; - if(x.deps&DEP_MODELb_m)out << "m: " << x.m<<' '; - if(x.deps&DEP_MODELb_F)out << "F: " << wcf.classString(x.F) << ' '; - if(x.deps&DEP_MODELb_E)out << "E: " << wce.classString(x.E) << ' '; - if(x.vacancies1!=-1)out << "v1: " << x.vacancies1 << ' '; - if(x.vacancies2!=-1)out << "v2: " << x.vacancies2 << ' '; - return out << '\n'; - } - friend ostream&printb1_m5(ostream&out,const m4_key&x,const WordClasses&wcf) - { - out << "-1 " << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' '; - out << x.vacancies1 << ' '; - out << x.vacancies2 << ' '; - return out; - } -}; - -class compare1 -{ - private: - int deps; - public: - compare1(int _deps) : deps(_deps) {} - bool operator()(const m4_key&a,const m4_key&b)const - { - if(deps&DEP_MODEL_l){if( a.l&out) -{ - string s; - istringstream l(in); - while(l>>s) - out.push_back(s); -} - -class d4model -{ - public: - typedef Vector > Vpff; - map D1; - map Db1; - PositionIndex msl; - WordClasses ewordclasses; - WordClasses fwordclasses; - template - void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile) - { - ifstream estrm(efile.c_str()),fstrm(ffile.c_str()); - if( !estrm ) - { - cerr << "ERROR: can not read " << efile << endl; - } - else - ewordclasses.read(estrm,m1); - if( !fstrm ) - cerr << "ERROR: can not read " << ffile << endl; - else - fwordclasses.read(fstrm,m2); - } - d4model(PositionIndex _msl) - : D1(compare1(M4_Dependencies)),Db1(compareb1(M4_Dependencies)),msl(_msl) - {} - COUNT&getCountRef_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m) - { - assert(j>=1); - m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1); - map::iterator p=D1.find(key); - if(p==D1.end())p=D1.insert(make_pair(key,Vpff(msl*2+1,pair(0.0,0.0)))).first; - assert(p!=D1.end()); - return (p->second)[j-j_cp+msl].first; - } - COUNT&getCountRef_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m) - { - assert(j>=1); - assert(j_prev>=1); - m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1); - map::iterator p=Db1.find(key); - 
if(p==Db1.end())p=Db1.insert(make_pair(key,Vpff(msl*2+1,pair(0.0,0.0)))).first; - assert(p!=Db1.end()); - return (p->second)[j-j_prev+msl].first; - } - map::const_iterator getProb_first_iterator(int E,int F,int l,int m)const - {return D1.find(m4_key(M4_Dependencies,l,m,F,E,0,-1,-1));} - PROB getProb_first_withiterator(WordIndex j,WordIndex j_cp,int m,const map::const_iterator& p)const - { - assert(j>=1);//assert(j_cp>=0); - assert(j<=msl);assert(j_cp<=msl); - if(p==D1.end()) - { - return PROB_SMOOTH; - } - else - { - massert((p->second)[j-j_cp+msl].second<=1.0); - return max(PROB_SMOOTH,d4modelsmooth_factor/(2*m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_cp+msl].second); - } - } - PROB getProb_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m)const - { - assert(j>=1);//assert(j_cp>=0); - assert(j<=msl);assert(j_cp<=msl); - m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1); - map::const_iterator p=D1.find(key); - if(p==D1.end()) - { - return PROB_SMOOTH; - } - else - { - massert((p->second)[j-j_cp+msl].second<=1.0); - return max(PROB_SMOOTH,d4modelsmooth_factor/(2*m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_cp+msl].second); - } - } - map::const_iterator getProb_bigger_iterator(int E,int F,int l,int m)const - { - return Db1.find(m4_key(M4_Dependencies,l,m,F,E,0,-1,-1)); - } - PROB getProb_bigger_withiterator(WordIndex j,WordIndex j_prev,int m,const map::const_iterator&p)const - { - massert(j>=1);massert(j_prev>=1); - massert(j>j_prev); - massert(j<=msl);massert(j_prev<=msl); - if(p==Db1.end()) - { - return PROB_SMOOTH; - } - else - { - massert((p->second)[j-j_prev+msl].second<=1.0 ); - return max(PROB_SMOOTH,d4modelsmooth_factor/(m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_prev+msl].second); - } - } - - PROB getProb_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m)const - { - massert(j>=1);massert(j_prev>=1); - massert(j>j_prev); - massert(j<=msl);massert(j_prev<=msl); - m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1); - map::const_iterator p=Db1.find(key); - if(p==Db1.end()) - { - return PROB_SMOOTH; - } - else - { - massert((p->second)[j-j_prev+msl].second<=1.0 ); - return max(PROB_SMOOTH,d4modelsmooth_factor/(m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_prev+msl].second); - } - } - void normalizeTable() - { - int nParams=0; - for(map::iterator i=D1.begin();i!=D1.end();++i) - { - Vpff&d1=i->second; - double sum=0.0; - for(PositionIndex i=0;i::iterator i=Db1.begin();i!=Db1.end();++i) - { - Vpff&db1=i->second; - double sum=0.0; - for(PositionIndex i=0;i::iterator i=D1.begin();i!=D1.end();++i) - { - Vpff&d1=i->second; - for(PositionIndex i=0;i::iterator i=Db1.begin();i!=Db1.end();++i) - { - Vpff&db1=i->second; - for(PositionIndex i=0;i::const_iterator i=D1.begin();i!=D1.end();++i) - { - const Vpff&d1=i->second; - double sum=0.0; - for(PositionIndex ii=0;iifirst,ewordclasses,fwordclasses); - out << "SUM: " << sum << ' '<< '\n'; - for(unsigned ii=0;ii::const_iterator i=Db1.begin();i!=Db1.end();++i) - { - const Vpff&db1=i->second; - double sum=0.0; - for(PositionIndex ii=0;iifirst,ewordclasses,fwordclasses); - out << "SUM: " << sum << ' '<<'\n'; - for(unsigned ii=0;ii::const_iterator i=D1.begin();i!=D1.end();++i) - { - const Vpff&d1=i->second; - for(unsigned ii=0;iifirst.E) << ' ' << fwordclasses.classString(i->first.F) << ' ' << (int)(ii)-(int)(msl) << ' ' << d1[ii].second << '\n'; - } - for(map::const_iterator i=Db1.begin();i!=Db1.end();++i) - { - const Vpff&db1=i->second; - for(unsigned ii=0;iifirst.F) << ' ' << (int)(ii)-(int)(msl) << ' ' << db1[ii].second << '\n'; - } 
- } - } - bool readProbTable(const char *fname) - { - cerr << "Reading D4Tables from " << fname << endl; - ifstream file(fname); - string line; - do - { - getline(file,line); - } while(line.length()&&line[0]=='#'); - - do - { - while(line.length()==0) - getline(file,line); - if( line[0]=='#') - break; - Vector linestr; - tokenize(line,linestr); - m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1); - for(unsigned int i=0;i> str >> sum; - iassert(str=="SUM:"); - if( str!="SUM:") - cerr << "ERROR: string is " << str << " and not sum " << endl; - - do - { - int value; - double count; - getline(file,line); - istringstream twonumbers(line); - if(twonumbers >> value >> count) - { - if( D1.count(k)==0 ) - D1.insert(make_pair(k,Vpff(msl*2+1,pair(0.0,0.0)))); - D1[k][value+msl]=make_pair(count,count/sum); - } - }while(line.length()); - }while(file); - do - { - getline(file,line); - } while(line.length()&&line[0]=='#'); - do - { - while(line.length()==0) - getline(file,line); - if( line[0]=='#') - break; - Vector linestr; - tokenize(line,linestr); - m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1); - bool sumRead=0; - for(unsigned int i=0;i> str >> sum; - else - { - str=linestr[0]; - sum=atof(linestr[1].c_str()); - } - if( str!="SUM:" ) - cerr << "ERROR: should read SUM but read " << str << endl; - do - { - int value; - double count; - getline(file,line); - istringstream twonumbers(line); - if(twonumbers >> value >> count) - { - if( Db1.count(k)==0 ) - Db1.insert(make_pair(k,Vpff(msl*2+1,pair(0.0,0.0)))); - Db1[k][value+msl]=make_pair(count,count/sum); - } - }while(file&&line.length()); - }while(file); - return 1; - } -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/D5Tables.h b/ext/giza-pp/GIZA++-v2/D5Tables.h deleted file mode 100644 index c69992b0..00000000 --- a/ext/giza-pp/GIZA++-v2/D5Tables.h +++ /dev/null @@ -1,235 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef _d5tables_h_define -#define _d5tables_h_define -#include -#include "D4Tables.h" - -extern float d5modelsmooth_countoffset; -extern float d5modelsmooth_factor; - -#define UNSEENPROB (1.0/vacancies_total) - -class d5model -{ - private: - typedef Vector < pair < COUNT,PROB > >Vpff; - map< m4_key,Vpff,compare1 > D1; - map< m4_key,Vpff,compareb1 > Db1; - public: - d4model&d4m; - WordClasses ewordclasses,fwordclasses; - template - void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile) - { - ifstream estrm(efile.c_str()),fstrm(ffile.c_str()); - if( !estrm ) - cerr << "ERROR: can not read classes from " << efile << endl; - else - ewordclasses.read(estrm,m1); - if( !fstrm ) - cerr << "ERROR: can not read classes from " << ffile << endl; - else - fwordclasses.read(fstrm,m2); - } - d5model (d4model&_d4m) - :D1 (compare1(M5_Dependencies)), Db1 (compareb1(M5_Dependencies)),d4m(_d4m) - {} - COUNT &getCountRef_first (PositionIndex vacancies_j, - PositionIndex vacancies_jp, int F, - PositionIndex l, PositionIndex m, - PositionIndex vacancies_total) - { - massert(vacancies_j>0); - massert(vacancies_total>0); - //massert(vacancies_jp<=vacancies_total); - massert(vacancies_j <=vacancies_total); - massert(vacancies_total<=m); - m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total); - map::iterator p=D1.find(key); - if(p==D1.end()) - p=D1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first; // !!! constrain length - massert(p!=D1.end()); - return (p->second)[vacancies_j].first; - } - COUNT &getCountRef_bigger (PositionIndex vacancies_j, - PositionIndex vacancies_jp, int F, - PositionIndex l, PositionIndex m, - PositionIndex vacancies_total) - { - massert(vacancies_j>0); - massert(vacancies_total>0); - massert (vacancies_jp <= vacancies_j); - massert (vacancies_j-vacancies_jp <= vacancies_total); - m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total); - map::iterator p=Db1.find(key); - if(p==Db1.end()) - p=Db1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first; // !!! 
constrain length - massert(p!=Db1.end()); - return (p->second)[vacancies_j - vacancies_jp].first; - } - PROB getProb_first (PositionIndex vacancies_j, PositionIndex vacancies_jp, - int F, PositionIndex l, PositionIndex m, - PositionIndex vacancies_total) const - { - massert(vacancies_j>0); - massert(vacancies_total>0); - //massert(vacancies_jp<=vacancies_total); - massert(vacancies_j <=vacancies_total); - massert(vacancies_total<=m); - m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total); - map::const_iterator p=D1.find(key); - if( p==D1.end() ) - return UNSEENPROB; - else - return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*(p->second)[vacancies_j].second); - } - PROB getProb_bigger (PositionIndex vacancies_j, PositionIndex vacancies_jp, - int F, PositionIndex l, PositionIndex m, - PositionIndex vacancies_total) const - { - massert(vacancies_j>0); - massert(vacancies_total>0); - massert (vacancies_jp <= vacancies_j); - massert (vacancies_j-vacancies_jp <= vacancies_total); - m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total); - map::const_iterator p=Db1.find(key); - if(p==Db1.end()) - return UNSEENPROB; - else - return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*(p->second)[vacancies_j - vacancies_jp].second); - } - void normalizeTable () - { - int nParams=0; - for(map::iterator i=D1.begin();i!=D1.end();++i) - { - Vpff&d1=i->second; - COUNT sum=0.0; - for(PositionIndex i=0;i::iterator i=Db1.begin();i!=Db1.end();++i) - { - Vpff&db1=i->second; - double sum=0.0; - for(PositionIndex i=0;i::const_iterator i=d5m.D1.begin();i!=d5m.D1.end();++i) - { - const Vpff&d1=i->second; - COUNT sum=0.0; - for(PositionIndex ii=0;iifirst,d5m.ewordclasses,d5m.fwordclasses); - out << (int)(ii) << ' ' << d1[ii].second << ' ' << d1[ii].first << '\n'; - } - out << endl; - } - } - out << "# Table for non-head of cept.\n"; - for(map::const_iterator i=d5m.Db1.begin();i!=d5m.Db1.end();++i) - { - const Vpff&db1=i->second; - double sum=0.0; - for(PositionIndex ii=0;iifirst,d5m.fwordclasses); - out << (int)(ii) << ' ' << db1[ii].second << ' ' << db1[ii].first << '\n'; - } - out << endl; - } - } - return out; - } - void readProbTable(const char*x) - { - ifstream f(x); - string l; - while(getline(f,l)) - { - if(l.length()&&l[0]=='#') - continue; - istringstream is(l.c_str()); - string E,F; - int v1,v2,ii; - double prob,count; - if(is>>E>>F>>v1>>v2>>ii>>prob>>count) - { - //cerr << "Read: " << E << " " << F << " " << v1 << " " << v2 << " " << prob<< endl; - if( count>0 ) - if( E=="-1") - getCountRef_bigger(ii,0,fwordclasses(F),1000,1000,v2)+=count; - else - getCountRef_first(ii,v1,fwordclasses(F),1000,1000,v2)+=count; - } - } - normalizeTable(); - ofstream of("M5FILE"); - of << (*this); - } - void clear() - { - for(map::iterator i=D1.begin();i!=D1.end();++i) - { - Vpff&d1=i->second; - for(PositionIndex i=0;i::iterator i=Db1.begin();i!=Db1.end();++i) - { - Vpff&db1=i->second; - for(PositionIndex i=0;i - -Dictionary::Dictionary(const char *filename){ - if(!strcmp(filename, "")){ - dead = true; - return; - } - dead = false; - cout << "Reading dictionary from: " << filename << '\n'; - ifstream dFile(filename); - if(!dFile){ - cerr << "ERROR: Can't open dictionary: " << filename << '\n'; - exit(1); - } - - currindexmin = 0; - currindexmax = 0; - currval = 0; - int p, q; - while((dFile >> p >> q)){ - pairs[0].push_back(p); - pairs[1].push_back(q); - } - cout << "Dictionary read; " << pairs[0].size() << " pairs loaded." 
<< '\n'; - dFile.close(); -} - - -bool Dictionary::indict(int p, int q){ - if(dead) return false; - if(p == 0 && q == 0) return false; - if(currval == p){ - for(int i = currindexmin; i <= currindexmax; i++) - if(pairs[1][i] == q) return true; - return false; - } - else{ - int begin = 0, end = pairs[0].size() - 1, middle = 0; - unsigned int t; - bool ret = false; - while(begin <= end){ - middle = begin + ((end - begin) >> 1); - if(p < pairs[0][middle]) end = middle - 1; - else if(p > pairs[0][middle]) begin = middle + 1; - else{ - break; - } - } - t = middle; - while(pairs[0][t] == p ) - if(pairs[1][t--] == q) ret = true; - currindexmin = t + 1; - t = middle + 1; - while(pairs[0][t] == p && t < pairs[0].size()) - if(pairs[1][t++] == q) ret = true; - currindexmax = t - 1; - currval = p; - return ret; - } -} - - diff --git a/ext/giza-pp/GIZA++-v2/Dictionary.h b/ext/giza-pp/GIZA++-v2/Dictionary.h deleted file mode 100644 index 3a5c71ec..00000000 --- a/ext/giza-pp/GIZA++-v2/Dictionary.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/* Noah A. Smith - Dictionary object for dictionary filter in Model 1 training - - 9 August 1999 -*/ - -#include -#include - -#include "Vector.h" - -#ifndef DICTIONARY_H -#define DICTIONARY_H - -class Dictionary{ - private: - Vector pairs[2]; - int currval; - int currindexmin; - int currindexmax; - bool dead; - public: - Dictionary(const char *); - bool indict(int, int); -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/FlexArray.h b/ext/giza-pp/GIZA++-v2/FlexArray.h deleted file mode 100644 index c7365f75..00000000 --- a/ext/giza-pp/GIZA++-v2/FlexArray.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - -Copyright (C) 1988,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
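Dictionary::indict above binary-searches the first column of the sorted pair list and then scans outward for a matching second id, caching the run bounds (currval, currindexmin, currindexmax) so that repeated queries for the same source id skip the search. Assuming the dictionary file is sorted by source id, the same lookup can be stated with std::equal_range; a sketch with names of my own:

```cpp
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

typedef std::vector<std::pair<int,int> > Pairs;

// Order entries by source id only, so equal_range yields the whole run
// of pairs whose first component equals the query.
struct ByFirst {
  bool operator()(const std::pair<int,int>& a, int b) const { return a.first < b; }
  bool operator()(int a, const std::pair<int,int>& b) const { return a < b.first; }
};

// Equivalent of Dictionary::indict (minus the caching, which is purely
// an optimization): is the pair (p,q) listed in the dictionary?
bool indict(const Pairs& d, int p, int q) {
  if (p == 0 && q == 0) return false; // same special case as above
  std::pair<Pairs::const_iterator, Pairs::const_iterator> r =
      std::equal_range(d.begin(), d.end(), p, ByFirst());
  for (Pairs::const_iterator it = r.first; it != r.second; ++it)
    if (it->second == q) return true;
  return false;
}

int main() {
  Pairs d; // must be sorted by source id
  d.push_back(std::make_pair(3, 7));
  d.push_back(std::make_pair(3, 9));
  d.push_back(std::make_pair(5, 1));
  std::cout << indict(d, 3, 9) << ' ' << indict(d, 3, 8) << '\n'; // 1 0
}
```

One subtlety in the original worth noting: the second scan tests pairs[0][t] before checking t < pairs[0].size(), so it relies on the queried value actually being present; the equal_range formulation avoids that bounds hazard.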
-
-*/
-#ifndef CLASS_FlexArray_defined
-#define CLASS_FlexArray_defined
-#include "Array.h"
-#include <iostream>
-#include <fstream>
-template<class T>
-class FlexArray
-{
-private:
-  Array<T> p;
-  int start,End;
-public:
-  FlexArray(int _start=0,int _end=-1)
-    : p(_end-_start+1),start(_start),End(_end) {}
-  FlexArray(int _start,int _end,const T&init)
-    : p(_end-_start+1,init),start(_start),End(_end) {}
-  T&operator[](int i)
-    {return p[i-start];}
-  const T&operator[](int i)const
-    {return p[i-start];}
-  int low()const{return start;}
-  int high()const{return End;}
-  T*begin(){return conv(p.begin());}
-  T*end(){return conv(p.end());}
-};
-
-template<class T>
-inline ostream&operator<<(ostream&out,const FlexArray<T>&x)
-{
-  for(int i=x.low();i<=x.high();++i)
-    out << i << ':' << x[i] << ';' << ' ';
-  return out;
-}
-
-
-#endif
diff --git a/ext/giza-pp/GIZA++-v2/ForwardBackward.cpp b/ext/giza-pp/GIZA++-v2/ForwardBackward.cpp
deleted file mode 100644
index 969316ac..00000000
--- a/ext/giza-pp/GIZA++-v2/ForwardBackward.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
-
-Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
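FlexArray above simply re-bases indexing so that callers can address histogram cells by signed jump widths instead of 0-based positions; HMMTables below instantiates rows as FlexArray<double>(-MAX_SENTENCE_LENGTH, MAX_SENTENCE_LENGTH, 0.0). A standalone equivalent over std::vector (OffsetArray is a name of my choosing, and the demo values are arbitrary):

```cpp
#include <iostream>
#include <vector>

// Minimal stand-in for FlexArray<T>: a vector addressed through a
// signed index range [low, high].
template <class T>
class OffsetArray {
  std::vector<T> p;
  int start, stop;
public:
  OffsetArray(int lo, int hi, const T& init = T())
      : p(hi - lo + 1, init), start(lo), stop(hi) {}
  T& operator[](int i) { return p[i - start]; }
  const T& operator[](int i) const { return p[i - start]; }
  int low() const { return start; }
  int high() const { return stop; }
};

int main() {
  OffsetArray<double> jumps(-4, 4, 0.0); // counts for jump widths -4..+4
  jumps[-1] += 2.0; // two backward jumps observed
  jumps[+1] += 5.0; // five forward jumps observed
  for (int i = jumps.low(); i <= jumps.high(); ++i)
    std::cout << i << ':' << jumps[i] << "; ";
  std::cout << '\n';
}
```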
- -*/ -#ifndef NO_TRAINING -#include "ForwardBackward.h" -#include "Globals.h" -#include "myassert.h" -#include "HMMTables.h" -#include "mymath.h" - - -double ForwardBackwardTraining(const HMMNetwork&net,Array&g,Array >&E){ - const int I=net.size1(),J=net.size2(),N=I*J; - Array alpha(N,0),beta(N,0),sum(J); - for(int i=0;i(beta.begin())+N-I-1; - for(int j=J-2;j>=0;--j) - for(int ti=I-1;ti>=0;--ti,--cur_beta) { - const double *next_beta=conv(beta.begin())+(j+1)*I; - const double *alprob=&net.outProb(j,ti,0),*next_node=&net.nodeProb(0,j+1); - for(int ni=0;ni(alpha.begin())+I; - cur_beta=conv(beta.begin())+I; - for(int j=1;j&e=E[ (E.size()==1)?0:(j-1) ]; - if( (E.size()!=1) || j==1 ) - { - e.resize(I,I); - fill(e.begin(),e.end(),0.0); - } - - for(int ti=0;ti(alpha.begin())+I*(j-1); - double *cur_e= &e(ti,0); - double this_node=net.nodeProb(ti,j); - const double* alprob= &net.outProb(j-1,0,ti); - for(int pi=0;pi()); - double bsum=0,esum=0,esum2; - for(int i=0;i&e=E[j]; - const double *epe=e.end(); - for(const double*ep=e.begin();ep!=epe;++ep) - esum+=*ep; - } - if( J>1 ) - esum2=esum/(J-1); - else - esum2=0.0; - if(!(esum2==0.0||mfabs(esum2-bsum)/bsum<1e-3*I)) - cout << "ERROR2: " << esum2 <<" " <(sum.begin()); - double* ge=conv(g.end()); - for(double* gp=conv(g.begin());gp!=ge;gp+=I) - { - *sumptr++=normalize_if_possible(gp,gp+I); - if(bsum && !(mfabs((*(sumptr-1)-bsum)/bsum)<1e-3*I)) - cout << "ERROR: " << *(sumptr-1) << " " << bsum << " " << mfabs((*(sumptr-1)-bsum)/bsum) << ' ' << I << ' ' << J << endl; - } - for(unsigned int j=0;j<(unsigned int)E.size();j++) - { - Array2&e=E[j]; - double* epe=e.end(); - if( esum ) - for(double*ep=e.begin();ep!=epe;++ep) - *ep/=esum; - else - for(double*ep=e.begin();ep!=epe;++ep) - *ep/=1.0/(max(I*I,I*I*(J-1))); - } - if( sum.size() ) - return sum[0]; - else - return 1.0; -} -void HMMViterbi(const HMMNetwork&net,Array&vit) { - const int I=net.size1(),J=net.size2(); - vit.resize(J); - Arrayg; - Array >e(1); - ForwardBackwardTraining(net,g,e); - for(int j=0;j(g.begin())+I*j; - vit[j]=max_element(begin,begin+I)-begin; - } -} -void HMMViterbi(const HMMNetwork&net,Array&g,Array&vit) { - const int I=net.size1(),J=net.size2(); - vit.resize(J); - for(int j=0;j(g.begin())+I*j; - vit[j]=max_element(begin,begin+I)-begin; - } -} - -double HMMRealViterbi(const HMMNetwork&net,Array&vitar,int pegi,int pegj,bool verbose){ - const int I=net.size1(),J=net.size2(),N=I*J; - Array alpha(N,-1); - Array bp(N,(double*)0); - vitar.resize(J); - if( J==0 ) - return 1.0; - for(int i=0;iI/2 ) - alpha[i]=0; // only first empty word can be chosen - bp[i]=0; - } - double *cur_alpha=conv(alpha.begin())+I; - double **cur_bp=conv(bp.begin())+I; - for(int j=1;j(alpha.begin())+I*(j-1); - double this_node=net.nodeProb(ti,j); - const double *alprob= &net.outProb(j-1,0,ti); - for(int pi=0;pi *cur_alpha ) - { - (*cur_alpha)=alpha_increment; - (*cur_bp)=prev_alpha; - } - } - } - } - for(int i=0;i(alpha.begin())+j*I; - vitar[J-1]=max_element(cur_alpha,cur_alpha+I)-cur_alpha; - double ret= *max_element(cur_alpha,cur_alpha+I); - while(bp[vitar[j]+j*I]) - { - cur_alpha-=I; - vitar[j-1]=bp[vitar[j]+j*I]-cur_alpha; - massert(vitar[j-1]=0); - j--; - } - massert(j==0); - if( verbose ) - { - cout << "VERB:PEG: " << pegi << ' ' << pegj << endl; - for(int j=0;j&g,Array >&E){ - Array vitar; - double ret=HMMRealViterbi(net,vitar); - const int I=net.size1(),J=net.size2(); - if( E.size()==1 ) - { - Array2&e=E[0]; - e.resize(I,I); - g.resize(I*J); - fill(g.begin(),g.end(),0.0); - fill(e.begin(),e.end(),0.0); - 
for(int i=0;i0 ) - e(vitar[i],vitar[i-1])++; - } - } - else - { - g.resize(I*J); - fill(g.begin(),g.end(),0.0); - for(int i=0;i0 ) - { - Array2&e=E[i-1]; - e.resize(I,I); - fill(e.begin(),e.end(),0.0); - e(vitar[i],vitar[i-1])++; - } - } - } - return ret; -} - -#endif - diff --git a/ext/giza-pp/GIZA++-v2/ForwardBackward.h b/ext/giza-pp/GIZA++-v2/ForwardBackward.h deleted file mode 100644 index 42449d3d..00000000 --- a/ext/giza-pp/GIZA++-v2/ForwardBackward.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - -Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef NO_EM_MARKOF_ZEUGS_DEFINED -#define NO_EM_MARKOF_ZEUGS_DEFINED -#ifndef NO_TRAINING -#include "myassert.h" -#include "Array.h" -#include "Array2.h" - -class HMMNetwork -{ - public: - int as,bs; - Array2 n; - Array > e; - Array alphainit; - Array betainit; - int ab; - double finalMultiply; - HMMNetwork(int I,int J) - : as(I),bs(J),n(as,bs),/*e(as,as,0.0),*/e(0),alphainit(as,1.0/as),betainit(as,1.0),ab(as*bs),finalMultiply(1.0) - {} - double getAlphainit(int i)const{return alphainit[i];} - double getBetainit(int i)const{return betainit[i];} - inline int size1()const{return as;} - inline int size2()const{return bs;} - inline const double&nodeProb(int i,int j)const - {return n(i,j);} - inline const double&outProb(int j,int i1,int i2)const - {/*massert(e[min(int(e.size())-1,j)](i1,i2) );*/ return e[min(int(e.size())-1,j)](i1,i2);} - friend ostream&operator<<(ostream&out,const HMMNetwork&x) - { - return out <<"N: \n"<< x.n << endl << "E: \n" << x.e << "A:\n" << x.alphainit << "B:\n" << x.betainit << endl; - } -}; -double ForwardBackwardTraining(const HMMNetwork&mc,Array&gamma,Array >&epsilon); -void HMMViterbi(const HMMNetwork&mc,Array&vit); -double HMMRealViterbi(const HMMNetwork&net,Array&vit,int pegi=-1,int pegj=-1,bool verbose=0); -double MaximumTraining(const HMMNetwork&net,Array&g,Array >&e); -void HMMViterbi(const HMMNetwork&net,Array&g,Array&vit); -#endif -#endif diff --git a/ext/giza-pp/GIZA++-v2/GNU.GPL b/ext/giza-pp/GIZA++-v2/GNU.GPL deleted file mode 100644 index 5b2225e4..00000000 --- a/ext/giza-pp/GIZA++-v2/GNU.GPL +++ /dev/null @@ -1,282 +0,0 @@ - - -Preamble - -The licenses for most software are designed to take away your freedom -to share and change it. By contrast, the GNU General Public License is -intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. 
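ForwardBackwardTraining above runs the classic alpha/beta recursions over an I-state, J-position trellis stored row-major, then normalizes the posteriors at each target position (cross-checking them against the beta sum). A compact restatement of those recursions for a generic HMM; the uniform initialization mirrors HMMNetwork's alphainit/betainit defaults, and every name here is illustrative:

```cpp
#include <iostream>
#include <vector>

// gamma[j*I+i]: posterior probability of state i at position j, computed
// with the same forward-backward recursions as the deleted routine.
// node[j*I+i] is the state/emission probability at position j and
// trans[i*I+k] the transition probability from state i to state k.
std::vector<double> forwardBackward(int I, int J,
                                    const std::vector<double>& node,
                                    const std::vector<double>& trans) {
  std::vector<double> alpha(I*J, 0.0), beta(I*J, 0.0), gamma(I*J, 0.0);
  for (int i = 0; i < I; ++i)
    alpha[i] = node[i] / I;                 // uniform alphainit
  for (int j = 1; j < J; ++j)
    for (int k = 0; k < I; ++k) {
      double s = 0.0;
      for (int i = 0; i < I; ++i) s += alpha[(j-1)*I+i] * trans[i*I+k];
      alpha[j*I+k] = s * node[j*I+k];
    }
  for (int i = 0; i < I; ++i)
    beta[(J-1)*I+i] = 1.0;                  // uniform betainit
  for (int j = J-2; j >= 0; --j)
    for (int i = 0; i < I; ++i) {
      double s = 0.0;
      for (int k = 0; k < I; ++k)
        s += trans[i*I+k] * node[(j+1)*I+k] * beta[(j+1)*I+k];
      beta[j*I+i] = s;
    }
  for (int j = 0; j < J; ++j) {             // per-position normalization,
    double sum = 0.0;                       // as in normalize_if_possible
    for (int i = 0; i < I; ++i)
      sum += (gamma[j*I+i] = alpha[j*I+i] * beta[j*I+i]);
    for (int i = 0; i < I; ++i)
      gamma[j*I+i] = sum ? gamma[j*I+i] / sum : 1.0 / I;
  }
  return gamma;
}

int main() {
  // Two states, three positions, toy probabilities.
  std::vector<double> node = {0.9,0.1, 0.2,0.8, 0.9,0.1};
  std::vector<double> trans = {0.7,0.3, 0.3,0.7};
  std::vector<double> g = forwardBackward(2, 3, node, trans);
  for (size_t i = 0; i < g.size(); ++i) std::cout << g[i] << ' ';
  std::cout << '\n';
}
```

HMMViterbi above reuses exactly these posteriors, taking the argmax state per position, while HMMRealViterbi tracks backpointers to recover the true max-product path.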
- -When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - -To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the software, or if you modify it. - -For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - -We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - -Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, -we want its recipients to know that what they have is not the -original, so that any problems introduced by others will not reflect -on the original authors' reputations. - -Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at -all. - -The precise terms and conditions for copying, distribution and -modification follow. - - -TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - -0. This License applies to any program or other work which contains a -notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the Program -(independent of having been made by running the Program). Whether that -is true depends on what the Program does. - -1. You may copy and distribute verbatim copies of the Program's source -code as you receive it, in any medium, provided that you conspicuously -and appropriately publish on each copy an appropriate copyright notice -and disclaimer of warranty; keep intact all the notices that refer to -this License and to the absence of any warranty; and give any other -recipients of the Program a copy of this License along with the -Program. 
- -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a -fee. - -2. You may modify your copy or copies of the Program or any portion of -it, thus forming a work based on the Program, and copy and distribute -such modifications or work under the terms of Section 1 above, -provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that - in whole or in part contains or is derived from the Program or - any part thereof, to be licensed as a whole at no charge to all - third parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you - provide a warranty) and that users may redistribute the program - under these conditions, and telling the user how to view a copy - of this License. (Exception: if the Program itself is interactive - but does not normally print such an announcement, your work based - on the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - -3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of - Sections 1 and 2 above on a medium customarily used for software - interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. 
(This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - -4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt otherwise -to copy, modify, sublicense or distribute the Program is void, and -will automatically terminate your rights under this License. However, -parties who have received copies, or rights, from you under this -License will not have their licenses terminated so long as such -parties remain in full compliance. - -5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - -6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted -herein. You are not responsible for enforcing compliance by third -parties to this License. - - -7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. 
- -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - -8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - -9. The Free Software Foundation may publish revised and/or new -versions of the General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Program does not specify a -version number of this License, you may choose any version ever -published by the Free Software Foundation. - -10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the -author to ask for permission. For software which is copyrighted by the -Free Software Foundation, write to the Free Software Foundation; we -sometimes make exceptions for this. Our decision will be guided by the -two goals of preserving the free status of all derivatives of our free -software and of promoting the sharing and reuse of software generally. - -NO WARRANTY - -11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE -LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS -AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF -ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - -12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - -END OF TERMS AND CONDITIONS diff --git a/ext/giza-pp/GIZA++-v2/Globals.h b/ext/giza-pp/GIZA++-v2/Globals.h deleted file mode 100644 index fc2953c8..00000000 --- a/ext/giza-pp/GIZA++-v2/Globals.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef Globals_asdf_defined -#define Globals_asdf_defined -#include -#include -#include -#include "defs.h" -#include "Vector.h" - -extern float PROB_SMOOTH; -extern bool Verbose, Log, Peg, Transfer, Transfer2to3, useDict ; -extern string Prefix, LogFilename, OPath, - SourceVocabFilename, TargetVocabFilename, CorpusFilename, TestCorpusFilename, - t_Filename, a_Filename, p0_Filename, d_Filename, n_Filename, dictionary_Filename; -extern ofstream logmsg ; -extern double M5P0,P0 ; -extern bool NODUMPS, FEWDUMPS ; -extern string Usage ; -extern unsigned int MAX_SENTENCE_LENGTH ; -extern int PegUntil; - -extern short DeficientDistortionForEmptyWord; - -extern int M4_Dependencies; -extern int M5_Dependencies; - -extern short OutputInAachenFormat; - -#define DEP_MODEL_l 1 -#define DEP_MODEL_m 2 -#define DEP_MODEL_F 4 -#define DEP_MODEL_E 8 - -#define DEP_MODELb_l 16 -#define DEP_MODELb_m 32 -#define DEP_MODELb_F 64 -#define DEP_MODELb_E 128 - -#define DEP_SUM 256 - -class vcbList; - -extern vcbList *globeTrainVcbList, *globfTrainVcbList; - -extern short PredictionInAlignments; -extern short SmoothHMM; -#define VERB Verbose - -double ErrorsInAlignment(const map< pair,char >&reference,const Vector&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int); -extern Vector,char > > ReferenceAlignment; -void printGIZAPars(ostream&out); - -#endif diff --git a/ext/giza-pp/GIZA++-v2/HMMTables.cpp b/ext/giza-pp/GIZA++-v2/HMMTables.cpp deleted file mode 100644 index f0372897..00000000 --- a/ext/giza-pp/GIZA++-v2/HMMTables.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - -Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "HMMTables.h" -#include -#include "Globals.h" -#include "Parameter.h" - -template -void HMMTables::writeJumps(ostream&out) const -{ - double ssum=0.0; - for(typename map,FlexArray >::const_iterator i=alProb.begin();i!=alProb.end();++i) - { - double sum=0.0; - out << "\n\nDistribution for: "; - printAlDeps(out,i->first,*mapper1,*mapper2); - out << ' '; - for(int a=i->second.low();a<=i->second.high();++a) - if( i->second[a] ) - { - out << a << ':' << i->second[a] << ';' << ' '; - sum+=i->second[a]; - } - out << '\n' << '\n'; - out << "SUM: " << sum << '\n'; - ssum+=sum; - } - out << "FULL-SUM: " << ssum << '\n'; -} -template -void HMMTables::readJumps(istream&) -{ -} -template -double HMMTables::getAlProb(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter) const -{ - massert(k=0); - massert(istrich=-1); - int pos=istrich-k; - switch(PredictionInAlignments) - { - case 0: pos=istrich-k; break; - case 1: pos=k; break; - case 2: - pos=(k*J-j*sentLength); - if( pos>0 ) pos+=J/2; else pos-=J/2; - pos/=J; - break; - default:abort(); - } - typename map,FlexArray >::const_iterator p=alProb.find(AlDeps(sentLength,istrich,j,w1,w2)); - if( p!=alProb.end() ) - { - return (p->second)[pos]; - } - else - { - if( iter>0&&iter<5000 ) - cout << "WARNING: Not found: " << ' ' << J << ' ' << sentLength << '\n';; - return 1.0/(2*sentLength-1); - } -} - -template -void HMMTables::addAlCount(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted) -{ - int pos=istrich-k; - switch(PredictionInAlignments) - { - case 0: pos=istrich-k; break; - case 1: pos=k; break; - case 2: - pos=(k*J-j*sentLength); - if( pos>0 ) pos+=J/2; else pos-=J/2; - pos/=J; - break; - default:abort(); - } - AlDeps deps(AlDeps(sentLength,istrich,j,w1,w2)); - - { - typename map,FlexArray >::iterator p=alProb.find(deps); - if( p==alProb.end() ) - { - if( (CompareAlDeps&1)==0 ) - p=alProb.insert(make_pair(deps,FlexArray (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first; - else - p=alProb.insert(make_pair(deps,FlexArray (-sentLength,sentLength,0.0))).first; - } - p->second[pos]+=value; - } - - if( valuePredicted ) - { - typename map,FlexArray >::iterator p=alProbPredicted.find(deps); - if( p==alProbPredicted.end() ) - { - if( (CompareAlDeps&1)==0 ) - p=alProbPredicted.insert(make_pair(deps,FlexArray (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first; - else - p=alProbPredicted.insert(make_pair(deps,FlexArray (-sentLength,sentLength,0.0))).first; - } - p->second[pos]+=valuePredicted; - } -} - -template -Array&HMMTables::doGetAlphaInit(int I) -{ - if( !init_alpha.count(I) ) - init_alpha[I]=Array(I,0); - return init_alpha[I]; -} -template -Array&HMMTables::doGetBetaInit(int I) -{ - if( !init_beta.count(I) ) - init_beta[I]=Array(I,0); - return init_beta[I]; -} - -template -bool 
HMMTables::getAlphaInit(int I,Array&x)const -{ - hash_map >::const_iterator i=init_alpha.find(I); - if( i==init_alpha.end() ) - return 0; - else - { - x=i->second; - for(unsigned int j=x.size()/2+1;j -bool HMMTables::getBetaInit(int I,Array&x)const -{ - hash_map >::const_iterator i=init_beta.find(I); - if( i==init_beta.end() ) - return 0; - else - { - x=i->second; - return 1; - } -} - -template -HMMTables:: HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2): - probabilityForEmpty(mfabs(_probForEmpty)), - updateProbabilityForEmpty(_probForEmpty<0.0), - mapper1(&m1), - mapper2(&m2) -{} -template -HMMTables::~HMMTables() {} diff --git a/ext/giza-pp/GIZA++-v2/HMMTables.h b/ext/giza-pp/GIZA++-v2/HMMTables.h deleted file mode 100644 index 051bd0a7..00000000 --- a/ext/giza-pp/GIZA++-v2/HMMTables.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - -Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef HMM_TABLES_H_ASDF_DEFINED -#define HMM_TABLES_H_ASDF_DEFINED -#include "FlexArray.h" - -#include "Array.h" -#include -#include "mymath.h" - -template -T normalize_if_possible(T*a,T*b) -{ - T sum=0; - for(T*i=a;i!=b;++i) - sum+=*i; - if( sum ) - for(T*i=a;i!=b;++i) - *i/=sum; - else - fill(a,b,1.0/(b-a)); - return sum; -} - -extern short CompareAlDeps; -template -class AlDeps -{ - public: - int englishSentenceLength; - CLS classPrevious; - int previous; - int j; - CLS Cj; - AlDeps(int l,int p=0,int _j=0,CLS s1=0,CLS _Cj=0) - : englishSentenceLength(l),classPrevious(s1),previous(p),j(_j),Cj(_Cj) - {} - friend bool operator<(const AlDeps&x,const AlDeps&y) - { - if( (CompareAlDeps&1) && x.englishSentenceLength -class Hash_AlDeps -{ - public: - unsigned - int - operator() - (const AlDeps&x) - const - { - unsigned int hash=0; - if( (CompareAlDeps&1) ) { hash=hash+x.englishSentenceLength;hash*=31;} - if( (CompareAlDeps&2) ) { hash=hash+x.classPrevious;hash*=31;} - if( (CompareAlDeps&4) ) { hash=hash+x.previous;hash*=31;} - if( (CompareAlDeps&8) ) { hash=hash+x.j;hash*=31;} - if( (CompareAlDeps&16) ) { hash=hash+x.Cj;hash*=31;} - return hash; - - } -}; - -template -class HMMTables -{ - protected: - double probabilityForEmpty; - bool updateProbabilityForEmpty; - hash_map > init_alpha; - hash_map > init_beta; - map,FlexArray > alProb; - map,FlexArray > alProbPredicted; - int globalCounter; - double divSum; - double p0_count,np0_count; - const MAPPERCLASSTOSTRING*mapper1; - const MAPPERCLASSTOSTRING*mapper2; - public: - const HMMTables*getThis()const {return this;} - HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2); - virtual ~HMMTables(); - virtual double getAlProb(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter=0) const; - virtual void 
writeJumps(ostream&) const; - void addAlCount(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted); - virtual void readJumps(istream&); - virtual bool getAlphaInit(int I,Array&x)const; - virtual bool getBetaInit(int I,Array&x)const; - Array&doGetAlphaInit(int I); - Array&doGetBetaInit(int I); - virtual double getProbabilityForEmpty()const - {return probabilityForEmpty;} - void performGISIteration(const HMMTables*old) - { - cout << "OLDSIZE: " << (old?(old->alProb.size()):0) << " NEWSIZE:"<< alProb.size()<< endl; - for(typename map,FlexArray >::iterator i=alProb.begin();i!=alProb.end();++i) - { - if( alProbPredicted.count(i->first)) - { - normalize_if_possible(i->second.begin(),i->second.end()); - normalize_if_possible(alProbPredicted[i->first].begin(),alProbPredicted[i->first].end()); - for(int j=i->second.low();j<=i->second.high();++j) - { - if( i->second[j] ) - if(alProbPredicted[i->first][j]>0.0 ) - { - double op=1.0; - if( old && old->alProb.count(i->first) ) - op=(old->alProb.find(i->first)->second)[j]; - //cerr << "GIS: " << j << ' ' << " OLD:" - // << op << "*true:" - // << i->second[j] << "/pred:" << alProbPredicted[i->first][j] << " -> "; - i->second[j]= op*(i->second[j]/alProbPredicted[i->first][j]); - //cerr << i->second[j] << endl; - } - else - { - cerr << "ERROR2 in performGISiteration: " << i->second[j] << endl; - } - } - } - else - cerr << "ERROR in performGISIteration: " << alProbPredicted.count(i->first) << endl; - } - } -}; - -template -inline void printAlDeps(ostream&out,const AlDeps&x,const MAPPERCLASSTOSTRING&mapper1,const MAPPERCLASSTOSTRING&mapper2) -{ - if( (CompareAlDeps&1) ) out << "sentenceLength: " << x.englishSentenceLength<< ' '; - if( (CompareAlDeps&2) ) out << "previousClass: " << mapper1.classString(x.classPrevious) << ' '; - if( (CompareAlDeps&4) ) out << "previousPosition: " << x.previous << ' '; - if( (CompareAlDeps&8) ) out << "FrenchPosition: " << x.j << ' '; - if( (CompareAlDeps&16) ) out << "FrenchClass: " << mapper2.classString(x.Cj) << ' '; - //out << '\n'; -} - -#endif diff --git a/ext/giza-pp/GIZA++-v2/LICENSE b/ext/giza-pp/GIZA++-v2/LICENSE deleted file mode 100644 index 5b2225e4..00000000 --- a/ext/giza-pp/GIZA++-v2/LICENSE +++ /dev/null @@ -1,282 +0,0 @@ - - -Preamble - -The licenses for most software are designed to take away your freedom -to share and change it. By contrast, the GNU General Public License is -intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - -When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - -To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the software, or if you modify it. 
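HMMTables above keys each jump distribution on an AlDeps context whose active fields (sentence length, previous class and position, target position and class) are selected by the CompareAlDeps bitmask, accumulates counts at offset i'-k in a FlexArray row, and answers unseen contexts with the uniform 1/(2*sentLength-1). A reduced sketch of that count/lookup cycle, conditioning on sentence length only; JumpTable and MAXJ are stand-in names, not GIZA++'s:

```cpp
#include <iostream>
#include <map>
#include <vector>

// Toy jump table keyed by sentence length alone; the real AlDeps can
// additionally condition on word classes and positions.
class JumpTable {
  // counts[l][jump + MAXJ]: mass observed for jump width i'-k in
  // sentences of length l.
  std::map<int, std::vector<double> > counts;
  enum { MAXJ = 100 }; // stand-in for MAX_SENTENCE_LENGTH
public:
  void addCount(int iPrime, int k, int sentLength, double value) {
    std::vector<double>& row = counts[sentLength];
    if (row.empty()) row.assign(2*MAXJ + 1, 0.0);
    row[(iPrime - k) + MAXJ] += value;
  }
  // Relative frequency of the jump, with the same uniform fallback that
  // getAlProb uses when the context was never observed.
  double prob(int iPrime, int k, int sentLength) const {
    std::map<int, std::vector<double> >::const_iterator p =
        counts.find(sentLength);
    if (p == counts.end()) return 1.0 / (2*sentLength - 1);
    const std::vector<double>& row = p->second;
    double sum = 0.0;
    for (size_t i = 0; i < row.size(); ++i) sum += row[i];
    return sum ? row[(iPrime - k) + MAXJ] / sum : 1.0 / (2*sentLength - 1);
  }
};

int main() {
  JumpTable t;
  t.addCount(3, 2, 10, 4.0); // jump +1, weight 4
  t.addCount(2, 2, 10, 1.0); // jump 0, weight 1
  std::cout << t.prob(3, 2, 10) << ' '   // 0.8
            << t.prob(5, 2, 10) << ' '   // 0 (context seen, jump not)
            << t.prob(3, 2, 7) << '\n';  // uniform fallback 1/13
}
```

The real table additionally keeps a parallel alProbPredicted map so that performGISIteration can rescale each stored probability by the ratio of observed to predicted mass, anchored to the previous iteration's value when one is available.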
- -END OF TERMS AND CONDITIONS diff --git a/ext/giza-pp/GIZA++-v2/Makefile b/ext/giza-pp/GIZA++-v2/Makefile deleted file mode 100644 index 01488492..00000000 --- a/ext/giza-pp/GIZA++-v2/Makefile +++ /dev/null @@ -1,140 +0,0 @@ -.SUFFIXES: .out .o .c .e .r .f .y .l .s .p .cpp .alpha2o .pentiumo .sgio .alphao - -INSTALLDIR ?= /usr/local/bin/ - -#CXX = g++ - -CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses -std=c++11 -#CFLAGS_OPT = $(CFLAGS) -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -ffast-math -CFLAGS_OPT = $(CFLAGS) -O3 -funroll-loops -DNDEBUG -DWORDINDEX_WITH_4_BYTE -DBINARY_SEARCH_FOR_TTABLE -DWORDINDEX_WITH_4_BYTE -CFLAGS_PRF = $(CFLAGS) -O2 -pg -DNDEBUG -DWORDINDEX_WITH_4_BYTE -CFLAGS_DBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE -CFLAGS_NRM = $(CFLAGS) -DWORDINDEX_WITH_4_BYTE -CFLAGS_VDBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE -DVDEBUG -SRC = *.cpp -TYPE = -LDFLAGS = - -include Makefile.src - -OBJ_DIR_PRF = profile/ -OBJ_DIR_OPT = optimized/ -OBJ_DIR_DBG = debug/ -OBJ_DIR_VDBG = vdebug/ -OBJ_DIR_NRM = norm/ -OBJ_OPT2 = ${SRC2:%.cpp=$(OBJ_DIR_OPT)%.o} -OBJ_OPT = ${SRC:%.cpp=$(OBJ_DIR_OPT)%.o} -OBJ_DBG = ${SRC:%.cpp=$(OBJ_DIR_DBG)%.o} -OBJ_VDBG = ${SRC:%.cpp=$(OBJ_DIR_VDBG)%.o} -OBJ_NRM = ${SRC:%.cpp=$(OBJ_DIR_NRM)%.o} -OBJ_PRF = ${SRC:%.cpp=$(OBJ_DIR_PRF)%.o} -OBJ_DIR = -DATE = `date +%d-%m-%Y` - -opt: GIZA++ snt2plain.out plain2snt.out snt2cooc.out - -GIZA++: $(OBJ_DIR_OPT) $(OBJ_OPT) - $(CXX) $(CFLAGS_OPT) $(OBJ_OPT) $(LDFLAGS) -o GIZA++ - -prf: GIZA++.prf - -GIZA++.prf: $(OBJ_DIR_PRF) $(OBJ_PRF) - $(CXX) $(CFLAGS_PRF) $(OBJ_PRF) -o GIZA++.prf $(LDFLAGS) - -dbg: GIZA++.dbg - -GIZA++.dbg: $(OBJ_DIR_DBG) $(OBJ_DBG) - $(CXX) $(CFLAGS_DBG) $(OBJ_DBG) -o GIZA++.dbg $(LDFLAGS) - -vdbg: GIZA++.vdbg - -GIZA++.vdbg: $(OBJ_DIR_VDBG) $(OBJ_VDBG) - $(CXX) $(CFLAGS_VDBG) $(OBJ_VDBG) -o GIZA++.vdbg $(LDFLAGS) - -nrm: GIZA++.nrm - -GIZA++.nrm: $(OBJ_DIR_NRM) $(OBJ_NRM) - $(CXX) $(CFLAGS_NRM) $(OBJ_NRM) -o GIZA++.nrm $(LDFLAGS) - -all: dbg opt nrm prf - -$(OBJ_DIR_PRF): $(OBJ_DIR) - -mkdir $(OBJ_DIR_PRF) - -$(OBJ_DIR_OPT): $(OBJ_DIR) - -mkdir $(OBJ_DIR_OPT) - -$(OBJ_DIR_DBG): $(OBJ_DIR) - -mkdir $(OBJ_DIR_DBG) - -$(OBJ_DIR_VDBG): $(OBJ_DIR) - -mkdir $(OBJ_DIR_VDBG) - -$(OBJ_DIR_NRM): $(OBJ_DIR) - -mkdir $(OBJ_DIR_NRM) - -$(OBJ_DIR): - -mkdir $(OBJ_DIR) - -$(OBJ_DIR_DBG)%.o: %.cpp - $(CXX) $(CFLAGS_DBG) -c $< -o $@ - -$(OBJ_DIR_VDBG)%.o: %.cpp - $(CXX) $(CFLAGS_VDBG) -c $< -o $@ - -$(OBJ_DIR_NRM)%.o: %.cpp - $(CXX) $(CFLAGS_NRM) -c $< -o $@ - -$(OBJ_DIR_PRF)%.o: %.cpp - $(CXX) $(CFLAGS_PRF) -c $< -o $@ - -$(OBJ_DIR_OPT)%.o: %.cpp - $(CXX) $(CFLAGS_OPT) -c $< -o $@ - -iinstall: opt prf dbg - -mkdir $(INSTALLDIR)/$(ARCH) - -cp GIZA++ $(INSTALLDIR)/GIZA++ - -cp GIZA++.prf $(INSTALLDIR)/GIZA++.prf - -cp GIZA++.dbg $(INSTALLDIR)/GIZA++.dbg - -install: opt - -mkdir $(INSTALLDIR) - -cp GIZA++ $(INSTALLDIR)/GIZA++ - -clean: - -rm -f $(OBJ_DIR_NRM)/*.o $(OBJ_DIR_DBG)/*.o $(OBJ_DIR_VDBG)/*.o $(OBJ_DIR_PRF)/*.o $(OBJ_DIR_OPT)/*.o - -rm -rf $(OBJ_DIR_NRM) $(OBJ_DIR_DBG) $(OBJ_DIR_VDBG) $(OBJ_DIR_PRF) $(OBJ_DIR_OPT) - -rm -f snt2plain.out plain2snt.out snt2cooc.out GIZA++ - - -backup: clean - tar cf - . 
| gzip -9 > ../GIZA++src.tar.gz
-
-depend: depend_CLEAN dependencies
-
-depend_CLEAN:
-	rm dependencies
-
-dependencies:
-	@(echo "#Automatically generated dependency list" >> dependencies ;\
-	$(CXX) -MM *.cpp $(CFLAGS_OPT) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_OPT)\1?g;print;}'>> dependencies)
-	@(echo "#Automatically generated dependency list" >> dependencies ;\
-	$(CXX) -MM *.cpp $(CFLAGS_DBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_DBG)\1?g;print;}'>> dependencies)
-	@(echo "#Automatically generated dependency list" >> dependencies ;\
-	$(CXX) -MM *.cpp $(CFLAGS_VDBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_VDBG)\1?g;print;}'>> dependencies)
-	@(echo "#Automatically generated dependency list" >> dependencies ;\
-	$(CXX) -MM *.cpp $(CFLAGS_NRM) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_NRM)\1?g;print;}'>> dependencies)
-	@(echo "#Automatically generated dependency list" >> dependencies ;\
-	$(CXX) -MM *.cpp $(CFLAGS_PRF) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_PRF)\1?g;print;}'>> dependencies)
-
--include dependencies
-
-snt2plain.out: snt2plain.cpp
-	$(CXX) $(LDFLAGS) -O3 -W -Wall snt2plain.cpp -o snt2plain.out
-
-plain2snt.out: plain2snt.cpp
-	$(CXX) $(LDFLAGS) -O3 -W -Wall plain2snt.cpp -o plain2snt.out
-
-snt2cooc.out: snt2cooc.cpp
-	$(CXX) $(LDFLAGS) -O3 -g -W -Wall snt2cooc.cpp -o snt2cooc.out
-
diff --git a/ext/giza-pp/GIZA++-v2/Makefile.definitions b/ext/giza-pp/GIZA++-v2/Makefile.definitions
deleted file mode 100644
index e69de29b..00000000
diff --git a/ext/giza-pp/GIZA++-v2/Makefile.src b/ext/giza-pp/GIZA++-v2/Makefile.src
deleted file mode 100644
index a6b8be7a..00000000
--- a/ext/giza-pp/GIZA++-v2/Makefile.src
+++ /dev/null
@@ -1,2 +0,0 @@
-SRC = Parameter.cpp myassert.cpp Perplexity.cpp model1.cpp model2.cpp model3.cpp getSentence.cpp TTables.cpp ATables.cpp AlignTables.cpp main.cpp NTables.cpp model2to3.cpp collCounts.cpp alignment.cpp vocab.cpp MoveSwapMatrix.cpp transpair_model3.cpp transpair_model5.cpp transpair_model4.cpp utility.cpp parse.cpp reports.cpp model3_viterbi.cpp model3_viterbi_with_tricks.cpp Dictionary.cpp model345-peg.cpp hmm.cpp HMMTables.cpp ForwardBackward.cpp
-
diff --git a/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.cpp b/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.cpp
deleted file mode 100644
index 2b0c3a3c..00000000
--- a/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
-
-Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "MoveSwapMatrix.h"
-
-template<class TRANSPAIR>
-MoveSwapMatrix<TRANSPAIR>::MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a)
-  : alignment(_a), ef(_ef), l(ef.get_l()), m(ef.get_m()), _cmove(l+1, m+1), _cswap(m+1, m+1),
-    delmove(l+1, m+1,0),delswap(m+1, m+1,0),changed(l+2, 0), changedCounter(1),
-    modelnr(_ef.modelnr()),lazyEvaluation(0),centerDeleted(0)
-{
-  double thisValue=ef.scoreOfAlignmentForChange((*this));
-  if( lazyEvaluation==0)
-    for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue);
-}
-
-template<class TRANSPAIR>
-void MoveSwapMatrix<TRANSPAIR>::updateJ(WordIndex j, bool useChanged,double thisValue)
-{
-  massert( lazyEvaluation==0 );
-  for(WordIndex i=0;i<=l;i++)
-    if( (useChanged==0||changed[i]!=changedCounter) )
-      if( get_al(j)!=i )
-        _cmove(i, j)=ef.scoreOfMove((*this), i, j,thisValue);
-      else
-        _cmove(i, j)=1.0;
-  for(WordIndex j2=j+1;j2<=m;j2++)
-    if( get_al(j)!=get_al(j2) )
-      _cswap(j, j2)=ef.scoreOfSwap((*this), j, j2,thisValue);
-    else
-      _cswap(j, j2)=1.0;
-  for(WordIndex j2=1;j2<j;j2++)
-    if( get_al(j)!=get_al(j2) )
-      _cswap(j2, j)=ef.scoreOfSwap((*this), j2, j,thisValue);
-    else
-      _cswap(j2, j)=1.0;
-}
-
-template<class TRANSPAIR>
-void MoveSwapMatrix<TRANSPAIR>::updateI(WordIndex i,double thisValue)
-{
-  massert( lazyEvaluation==0);
-  for(WordIndex j=1;j<=m;j++)
-    if( get_al(j)!=i )
-      _cmove(i, j)=ef.scoreOfMove((*this), i, j,thisValue);
-    else
-      _cmove(i, j)=1.0;
-}
-
-template<class TRANSPAIR>
-void MoveSwapMatrix<TRANSPAIR>::printWrongs()const{
-  for(WordIndex i=0;i<=l;i++)
-    {
-      for(WordIndex j=1;j<=m;j++)
-        if( get_al(j)==i)
-          cout << "A";
-        else
-          {
-            LogProb real=_cmove(i, j), wanted=ef.scoreOfMove((*this), i, j);
-            if( fabs(1.0-real/wanted)>1e-3 )
-              cout << 'b';
-            else if(fabs(1.0-real/wanted)>1e-10 )
-              cout << 'e';
-            else if(real!=wanted)
-              cout << 'E';
-            else
-              cout << ' ';
-          }
-      cout << endl;
-    }
-  cout << endl;
-  for(WordIndex j=1;j<=m;j++)
-    {
-      for(WordIndex j1=1;j1<=m;j1++)
-        if( j1>j )
-          {
-            if( get_al(j)==get_al(j1) )
-              cout << 'A';
-            else
-              cout << (_cswap(j, j1)==ef.scoreOfSwap((*this), j, j1));
-          }
-        else
-          cout << ' ';
-      cout << endl;
-    }
-  massert(0);
-}
-template<class TRANSPAIR>
-bool MoveSwapMatrix<TRANSPAIR>::isRight()const{
-  if( lazyEvaluation )
-    return 1;
-  for(WordIndex i=0;i<=l;i++)
-    for(WordIndex j=1;j<=m;j++)
-      if( get_al(j)!=i && (!(doubleEqual(_cmove(i, j), ef.scoreOfMove((*this), i, j)))) )
-        {
-          cerr << "DIFF: " << i << " " << j << " " << _cmove(i, j) << " " << ef.scoreOfMove((*this), i, j) << endl;
-          return 0;
-        }
-  for(WordIndex j=1;j<=m;j++)
-    for(WordIndex j1=1;j1<=m;j1++)
-      if( j1>j&&get_al(j)!=get_al(j1)&&(!doubleEqual(_cswap(j, j1), ef.scoreOfSwap((*this), j, j1))) )
-        {
-          cerr << "DIFFERENT: " << j << " " << j1 << " " << _cswap(j, j1) << " " << ef.scoreOfSwap((*this), j, j1) << endl;
-          return 0;
-        }
-  return 1;
-}
-
-template<class TRANSPAIR>
-void MoveSwapMatrix<TRANSPAIR>::doMove(WordIndex _i, WordIndex _j)
-{
-  WordIndex old_i=get_al(_j);
-  if( lazyEvaluation )
-    set(_j,_i);
-  else
-    {
-      if ( modelnr==5||modelnr==6 )
-        {
-          set(_j, _i);
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue);
-        }
-      else if ( modelnr==4 )
-        {
-          changedCounter++;
-          for(unsigned int k=prev_cept(old_i);k<=next_cept(old_i);++k)changed[k]=changedCounter;
-          for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)changed[k]=changedCounter;
-          set(_j, _i);
-          for(unsigned int k=prev_cept(old_i);k<=next_cept(old_i);++k)changed[k]=changedCounter;
-          for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)changed[k]=changedCounter;
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          for(unsigned int i=0;i<=l;i++)
-            if(changed[i]==changedCounter)
-              updateI(i,thisValue);
-          for(unsigned int j=1;j<=m;j++)
-            if( changed[get_al(j)]==changedCounter )
-              updateJ(j, 1,thisValue);
-        }
-      else
-        {
-          assert(modelnr==3);
-          set(_j, _i);
-          changedCounter++;
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          updateI(old_i,thisValue);
-          changed[old_i]=changedCounter;
-          updateI(_i,thisValue);
-          changed[_i]=changedCounter;
-          for(WordIndex j=1;j<=m;j++)
-            if( get_al(j)==_i || get_al(j)==old_i )
-              updateJ(j, 1,thisValue);
-        }
-    }
-}
-template<class TRANSPAIR>
-void MoveSwapMatrix<TRANSPAIR>::doSwap(WordIndex _j1, WordIndex _j2)
-{
-  assert( cswap(_j1, _j2)>1 );
-  WordIndex i1=get_al(_j1), i2=get_al(_j2);
-  if( lazyEvaluation==1 )
-    {
-      set(_j1, i2);
-      set(_j2, i1);
-    }
-  else
-    {
-      if ( modelnr==5||modelnr==6 )
-        {
-          set(_j1, i2);
-          set(_j2, i1);
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue);
-        }
-      else if( modelnr==4 )
-        {
-          changedCounter++;
-          for(unsigned int k=prev_cept(i1);k<=next_cept(i1);++k)changed[k]=changedCounter;
-          for(unsigned int k=prev_cept(i2);k<=next_cept(i2);++k)changed[k]=changedCounter;
-          set(_j1, i2);
-          set(_j2, i1);
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          for(unsigned int i=0;i<=l;i++)
-            if(changed[i]==changedCounter)
-              updateI(i,thisValue);
-          for(unsigned int j=1;j<=m;j++)
-            if( changed[get_al(j)]==changedCounter )
-              updateJ(j, 1,thisValue);
-        }
-      else
-        {
-          assert(modelnr==3);
-          set(_j1, i2);
-          set(_j2, i1);
-          changedCounter++;
-          double thisValue=ef.scoreOfAlignmentForChange((*this));
-          updateI(i1,thisValue);
-          changed[i1]=changedCounter;
-          updateI(i2,thisValue);
-          changed[i2]=changedCounter;
-          updateJ(_j1, 1,thisValue);
-          updateJ(_j2, 1,thisValue);
-        }
-    }
-}
-
-#include "transpair_model3.h"
-#include "transpair_model4.h"
-#include "transpair_model5.h"
-#include "transpair_modelhmm.h"
-template class MoveSwapMatrix<transpair_model3>;
-template class MoveSwapMatrix<transpair_model4>;
-template class MoveSwapMatrix<transpair_model5>;
-template class MoveSwapMatrix<transpair_modelhmm>;
diff --git a/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.h b/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.h
deleted file mode 100644
index b1bbf15f..00000000
--- a/ext/giza-pp/GIZA++-v2/MoveSwapMatrix.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-MoveSwapMatrix: Efficient representation for moving and swapping
-around in IBM3 training.
-Franz Josef Och (30/07/99)
---*/
-#ifndef moveswap2_costs_h_defined
-#define moveswap2_costs_h_defined
-#include "alignment.h"
-#include "transpair_model3.h"
-#include "myassert.h"
-
-extern short DoViterbiTraining;
-
-template<class TRANSPAIR>
-class MoveSwapMatrix : public alignment
-{
-  private:
-    const TRANSPAIR&ef;
-    const WordIndex l, m;
-    Array2<LogProb, Vector<LogProb> > _cmove, _cswap;
-    Array2<char, Vector<char> > delmove,delswap;
-    Vector<int> changed;
-    int changedCounter;
-    const int modelnr;
-    bool lazyEvaluation;
-    bool centerDeleted;
-  public:
-    bool check()const
-      {
-        return 1;
-      }
-    const TRANSPAIR&get_ef()const
-      {return ef;}
-    bool isCenterDeleted()const
-      {return centerDeleted;}
-    bool isLazy()const
-      {return lazyEvaluation;}
-    MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a);
-    void updateJ(WordIndex j, bool,double thisValue);
-    void updateI(WordIndex i,double thisValue);
-    void doMove(WordIndex _i, WordIndex _j);
-    void doSwap(WordIndex _j1, WordIndex _j2);
-    void delCenter()
-      {
-        centerDeleted=1;
-      }
-    void delMove(WordIndex x, WordIndex y)
-      {
-        delmove(x,y)=1;
-      }
-    void delSwap(WordIndex x, WordIndex y)
-      {
-        massert(y>x);
-        delswap(x,y)=1;
-        delswap(y,x)=1;
-      }
-    bool isDelMove(WordIndex x, WordIndex y)const
-      {
-        return DoViterbiTraining||delmove(x,y);
-      }
-    bool isDelSwap(WordIndex x, WordIndex y)const
-      {
-        massert(y>x);
-        return DoViterbiTraining||delswap(x,y);
-      }
-    LogProb cmove(WordIndex x, WordIndex y)const
-      {
-        massert( get_al(y)!=x );
-        massert( delmove(x,y)==0 );
-        if( lazyEvaluation )
-          return ef.scoreOfMove(*this,x,y);
-        else
-          {
-            return _cmove(x, y);
-          }
-      }
-    LogProb cswap(WordIndex x, WordIndex y)const
-      {
-        massert(x<y);
-        if( lazyEvaluation )
-          return ef.scoreOfSwap(*this,x,y);
-        else
-          {
-            massert(y>x);
-            return _cswap(x, y);
-          }
-      }
-    void printWrongs()const;
-    bool isRight()const;
-    friend ostream&operator<<(ostream&out, const MoveSwapMatrix<TRANSPAIR>&m)
-      {return out << (alignment)m << "\nEF:\n"<< m.ef << "\nCMOVE\n"<<m._cmove << "\nCSWAP\n"<<m._cswap;};
-};
-#endif
diff --git a/ext/giza-pp/GIZA++-v2/NTables.cpp b/ext/giza-pp/GIZA++-v2/NTables.cpp
deleted file mode 100644
--- a/ext/giza-pp/GIZA++-v2/NTables.cpp
+++ /dev/null
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "NTables.h"
-#include <iostream>
-#include "defs.h"
-#include <fstream>
-#include "Parameter.h"
-
-GLOBAL_PARAMETER(double,NTablesFactorGraphemes,"nSmooth","smoothing for fertility parameters (good value: 64): weight for wordlength-dependent fertility parameters",PARLEV_SMOOTH,64.0);
-GLOBAL_PARAMETER(double,NTablesFactorGeneral,"nSmoothGeneral","smoothing for fertility parameters (default: 0): weight for word-independent fertility parameters",PARLEV_SMOOTH,0.0);
-
-template <class VALTYPE>
-void nmodel<VALTYPE>::printNTable(int noEW, const char* filename,
-                                  const Vector<WordEntry>& evlist,
-                                  bool actual) const
-  // prints the fertility table but with actual source words (not their ids)
-{
-  cerr << "Dumping nTable to: " << filename << '\n';
-  ofstream of(filename);
-  VALTYPE p ;
-  WordIndex k, i ;
-  for(i=1; int(i) < noEW; i++){
-    if (evlist[i].freq > 0){
-      if (actual)
-        of << evlist[i].word << ' ' ;
-      else
-        of << i << ' ' ;
-      for( k=0; k < MAX_FERTILITY; k++){
-        p = getValue(i, k);
-        if (p <= PROB_SMOOTH)
-          p = 0;
-        of << p << ' ';
-      }
-      of << '\n';
-    }
-  }
-}
-
-template <class VALTYPE>
-void nmodel<VALTYPE>::readNTable(const char *filename){
-  /* This function reads the n table from a file.
-     Each line is of the format:  source_word_id p0 p1 p2 ... pn
-     This is the inverse operation of the printTable function.
- NAS, 7/11/99 - */ - ifstream inf(filename); - cerr << "Reading fertility table from " << filename << "\n"; - if(!inf){ - cerr << "\nERROR: Cannot open " << filename <<"\n"; - return; - } - - VALTYPE prob; - WordIndex tok, i; - int nFert=0; - while(!inf.eof()){ - nFert++; - inf >> ws >> tok; - if (tok > MAX_VOCAB_SIZE){ - cerr << "NTables:readNTable(): unrecognized token id: " << tok - <<'\n'; - exit(-1); - } - for(i = 0; i < MAX_FERTILITY; i++){ - inf >> ws >> prob; - getRef(tok, i)=prob; - } - } - cerr << "Read " << nFert << " entries in fertility table.\n"; - inf.close(); -} - -template class nmodel; -//template class nmodel; diff --git a/ext/giza-pp/GIZA++-v2/NTables.h b/ext/giza-pp/GIZA++-v2/NTables.h deleted file mode 100644 index 4bb05659..00000000 --- a/ext/giza-pp/GIZA++-v2/NTables.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _ntables_h -#define _ntables_h 1 -#include "Array2.h" -#include "Vector.h" -#include -#include "defs.h" -#include "vocab.h" -#include "myassert.h" -#include "Globals.h" - -extern double NTablesFactorGraphemes,NTablesFactorGeneral; - -template -class nmodel -{ - private: - Array2 > ntab; - public: - nmodel(int maxw, int maxn) - : ntab(maxw, maxn, 0.0) - {} - VALTYPE getValue(int w, unsigned int n)const - { - massert(w!=0); - if(n>=ntab.getLen2()) - return 0.0; - else - return max(ntab(w, n), VALTYPE(PROB_SMOOTH)); - } - VALTYPE&getRef(int w, int n) - { - //massert(w!=0); - return ntab(w, n); - } - template - void normalize(nmodel&write,const Vector* _evlist)const -{ - int h1=ntab.getLen1(), h2=ntab.getLen2(); - int nParams=0; - if( _evlist&&(NTablesFactorGraphemes||NTablesFactorGeneral) ) - { - size_t maxlen=0; - const Vector&evlist=*_evlist; - for(unsigned int i=1;i > counts(maxlen+1,MAX_FERTILITY+1,0.0); - Vector nprob_general(MAX_FERTILITY+1,0.0); - for(unsigned int i=1;i& evlist, bool) const; - void readNTable(const char *filename); - -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/Parameter.cpp b/ext/giza-pp/GIZA++-v2/Parameter.cpp deleted file mode 100644 index 8379a259..00000000 --- a/ext/giza-pp/GIZA++-v2/Parameter.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "Parameter.h" -#include -#include -#include - - -bool absolutePathNames=0; -string ParameterPathPrefix; -bool ParameterChangedFlag=0; - -bool writeParameters(ofstream&of,const ParSet&parset,int level) -{ - if(!of)return 0; - for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i) - { - if(((*i)->getLevel()==level||level==-1)&&(*i)->onlyCopy==0) - { - ostringstream os; - (*i)->printValue(os); - os << ends; - string s(os.str()); - of << (*i)->getString() << " "; - if( absolutePathNames&&(*i)->isFilename()&&s.length()&&s[0]!='/' ) - { - char path[1024]; - getcwd(path,1024); - of << path << '/'; - } - if( ParameterPathPrefix.length()&&(*i)->isFilename()&&s.length()&&s[0]!='/' ) - of << ParameterPathPrefix << '/'; - (*i)->printValue(of); - of << endl; - } - } - return 1; -} - -bool readParameters(ifstream&f,const ParSet&parset,int verb,int level) -{ - string s; - if(!f)return 0; - while(getline(f,s)) - { - istringstream eingabe(s); - string s1,s2; - eingabe>>s1>>s2; - if(makeSetCommand(s1,s2,parset,verb,level)==0) - cerr << "ERROR: could not set: (C) " << s1 << " " << s2 << endl; - } - return 1; -} - - -bool makeSetCommand(string _s1,string s2,const ParSet&parset,int verb,int level) -{ - ParPtr anf; - int anfset=0; - string s1=simpleString(_s1); - for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i) - { - if( *(*i)==s1 ) - { - if( level==-1 || level==(*i)->getLevel() ) - (*i)->setParameter(s2,verb); - else if(verb>1) - cerr << "ERROR: Could not set: (A) " << s1 << " " << s2 << " " << level << " " << (*i)->getLevel() << endl; - return 1; - } - else if( (*i)->getString().substr(0,s1.length())==s1 ) - { - anf=(*i);anfset++; - } - } - if(anfset==1) - { - if( level==-1 || level==anf->getLevel() ) - anf->setParameter(s2,verb); - else if( verb>1 ) - cerr << "ERROR: Could not set: (B) " << s1 << " " << s2 << " " << level << " " << anf->getLevel() << endl; - return 1; - } - if( anfset>1 ) - cerr << "ERROR: ambiguous parameter '" << s1 << "'.\n"; - if( anfset==0 ) - cerr << "ERROR: parameter '" << s1 << "' does not exist.\n"; - return 0; -} - -ostream& printPars(ostream&of,const ParSet&parset,int level) -{ - if(!of)return of; - for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i) - { - if(((*i)->getLevel()==level||level==-1)&&(*i)->onlyCopy==0) - { - (*i)->printAt(of); - of << endl; - } - } - return of; -} - -string simpleString(const string s) -{ - string k; - for(unsigned int i=0;i='a'&&c[0]<='z')||(c[0]>='0'&&c[0]<='9') ) - k += c; - } - return k; -} - - -ParSet&getGlobalParSet() -{ - static ParSet x; - return x; -} diff --git a/ext/giza-pp/GIZA++-v2/Parameter.h b/ext/giza-pp/GIZA++-v2/Parameter.h deleted file mode 100644 index 9a6239da..00000000 --- a/ext/giza-pp/GIZA++-v2/Parameter.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef PARAMETER_H_DEFINED -#define PARAMETER_H_DEFINED - -#include "mystl.h" -#include -#include "Pointer.h" -#include -#include "Globals.h" -#include -#include - -inline unsigned int mConvert(const string&s,unsigned int &i) -{ - if( strcasecmp(s.c_str(),"yes")==0 || strcasecmp(s.c_str(),"y")==0 || strcasecmp(s.c_str(),"true")==0 || strcasecmp(s.c_str(),"t")==0 ) { cerr << "TRUE\n";return i=1; } - if( strcasecmp(s.c_str(),"no")==0 || strcasecmp(s.c_str(),"n")==0 || strcasecmp(s.c_str(),"false")==0 || strcasecmp(s.c_str(),"f")==0 ) { cerr << "FALSE\n";return i=0;} - return i=atoi(s.c_str()); -} -inline int mConvert(const string&s,int &i){ - if( strcasecmp(s.c_str(),"yes")==0 || strcasecmp(s.c_str(),"y")==0 || strcasecmp(s.c_str(),"true")==0 || strcasecmp(s.c_str(),"t")==0 ) { cerr << "TRUE\n";return i=1;} - if( strcasecmp(s.c_str(),"no")==0 || strcasecmp(s.c_str(),"n")==0 || strcasecmp(s.c_str(),"false")==0 || strcasecmp(s.c_str(),"f")==0 ) { cerr << "FALSE\n";return i=0;} - return i=atoi(s.c_str()); -} -inline double mConvert(const string&s,double &d) { return d=atof(s.c_str()); } -inline double mConvert(const string&s,float &d) { return d=atof(s.c_str()); } -inline string mConvert(const string&s,string&n) { return n=s; } -inline bool mConvert(const string&s,bool&n) { - if( strcasecmp(s.c_str(),"yes")==0 || strcasecmp(s.c_str(),"y")==0 || strcasecmp(s.c_str(),"true")==0 || strcasecmp(s.c_str(),"t")==0 ) { cerr << "TRUE\n";return n=1;} - if( strcasecmp(s.c_str(),"no")==0 || strcasecmp(s.c_str(),"n")==0 || strcasecmp(s.c_str(),"false")==0 || strcasecmp(s.c_str(),"f")==0 ) { cerr << "FALSE\n";return n=0;} - return n=atoi(s.c_str()); -} -inline short mConvert(const string&s,short&n) { - if( strcasecmp(s.c_str(),"yes")==0 || strcasecmp(s.c_str(),"y")==0 || strcasecmp(s.c_str(),"true")==0 || strcasecmp(s.c_str(),"t")==0 ) { cerr << "TRUE\n";return n=1;} - if( strcasecmp(s.c_str(),"no")==0 || strcasecmp(s.c_str(),"n")==0 || strcasecmp(s.c_str(),"false")==0 || strcasecmp(s.c_str(),"f")==0 ) { cerr << "FALSE\n";return n=0;} - return n=atoi(s.c_str()); -} -inline unsigned short mConvert(const string&s,unsigned short&n) { - if( strcasecmp(s.c_str(),"yes")==0 || strcasecmp(s.c_str(),"y")==0 || strcasecmp(s.c_str(),"true")==0 || strcasecmp(s.c_str(),"t")==0 ) { cerr << "TRUE\n";return n=1;} - if( strcasecmp(s.c_str(),"no")==0 || strcasecmp(s.c_str(),"n")==0 || strcasecmp(s.c_str(),"false")==0 || strcasecmp(s.c_str(),"f")==0 ) { cerr << "FALSE\n";return n=0;} - return n=atoi(s.c_str()); -} - -string simpleString(const string s); - -inline int Hashstring(const string& s) -{ - int sum=0; - string::const_iterator i=s.begin(),end=s.end(); - for(;i!=end;i++)sum=5*sum+(*i); - return sum; -} - -class _Parameter -{ - protected: - string name; - bool *ifChanged; - string description; - int level; - bool filename; - public: - 
int onlyCopy; - _Parameter(string n,bool&b,string desc,int _level,bool _onlyCopy) - : name(simpleString(n)),ifChanged(&b),description(desc),level(_level),filename(0),onlyCopy(_onlyCopy) {} - virtual ~_Parameter(){}; - bool operator==(const string&s)const - { return name== simpleString(s); } - void setChanged() - { *ifChanged=true; } - virtual bool setParameter(string s2,int)=0; - virtual ostream&printAt(ostream&out)=0; - virtual ostream&printValue(ostream&out)=0; - const string&getString() const { return name; } - int getLevel() const { return level;} - bool isFilename() { return filename;} - void setFilename(bool x=1) { filename=x;} - friend bool operator==(const _Parameter&a,const _Parameter&b) - { return a.name==b.name; } - friend bool operator<(const _Parameter&a,const _Parameter&b) - { return a.name -class Parameter : public _Parameter -{ - private: - T*t; - public: - Parameter(string n,bool&b,string desc,T&_t,int level=0,bool onlyCopy=0) - : _Parameter(n,b,desc,level,onlyCopy),t(&_t) {} - virtual ~Parameter(){} - virtual bool setParameter(string s2,int verb) - { - T x; - if( !(*t==mConvert(s2,x))) - { - bool printedFirst=0; - if( verb>1 ) - { - cout << "Parameter '"< ParPtr; - -class ParSet : public set -{ - public: - void insert(const ParPtr&x) - { - if( count(x)!=0 ) - cerr << "ERROR: element " << x->getString() << " already inserted.\n"; - set::insert(x); - } -}; - -bool makeSetCommand(string s1,string s2,const ParSet&pars,int verb=1,int level= -1); -ostream&printPars(ostream&out,const ParSet&pars,int level=-1); -bool writeParameters(ofstream&of,const ParSet&parset,int level=0); -bool readParameters(ifstream&f,const ParSet&parset,int verb=2,int level=0); -ParSet&getGlobalParSet(); -extern bool ParameterChangedFlag; -templateconst T&addGlobalParameter(const char *name,const char *description,int level,T*adr,const T&init) -{ - *adr=init; - getGlobalParSet().insert(new Parameter(name,ParameterChangedFlag,description,*adr,level)); - return init; -} -templateconst T&addGlobalParameter(const char *name,const char *name2,const char *description,int level,T*adr,const T&init) -{ - *adr=init; - getGlobalParSet().insert(new Parameter(name,ParameterChangedFlag,description,*adr,level)); - getGlobalParSet().insert(new Parameter(name2,ParameterChangedFlag,description,*adr,-1)); - return init; -} -templateconst T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *description,int level,T*adr,const T&init) -{ - *adr=init; - getGlobalParSet().insert(new Parameter(name,ParameterChangedFlag,description,*adr,level)); - getGlobalParSet().insert(new Parameter(name2,ParameterChangedFlag,description,*adr,-1)); - getGlobalParSet().insert(new Parameter(name3,ParameterChangedFlag,description,*adr,-1)); - return init; -} -templateconst T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *name4,const char *description,int level,T*adr,const T&init) -{ - *adr=init; - getGlobalParSet().insert(new Parameter(name,ParameterChangedFlag,description,*adr,level)); - getGlobalParSet().insert(new Parameter(name2,ParameterChangedFlag,description,*adr,-1)); - getGlobalParSet().insert(new Parameter(name3,ParameterChangedFlag,description,*adr,-1)); - getGlobalParSet().insert(new Parameter(name4,ParameterChangedFlag,description,*adr,-1)); - return init; -} -void MakeParameterOptimizing(istream&file,string resultingParameters); - -#define GLOBAL_PARAMETER(TYP,VARNAME,NAME,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP 
>(NAME,DESCRIPTION,LEVEL,&VARNAME,INIT); -#define GLOBAL_PARAMETER2(TYP,VARNAME,NAME,NAME2,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,DESCRIPTION,LEVEL,&VARNAME,INIT); -#define GLOBAL_PARAMETER3(TYP,VARNAME,NAME,NAME2,NAME3,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,DESCRIPTION,LEVEL,&VARNAME,INIT); -#define GLOBAL_PARAMETER4(TYP,VARNAME,NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,&VARNAME,INIT); - -void setParameterLevelName(unsigned int i,string x); - -#endif diff --git a/ext/giza-pp/GIZA++-v2/Perplexity.cpp b/ext/giza-pp/GIZA++-v2/Perplexity.cpp deleted file mode 100644 index d44dec58..00000000 --- a/ext/giza-pp/GIZA++-v2/Perplexity.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/* Perplexity.cc - * ============= - * Mike Jahr, 7/21/99 - * Machine Translation group, WS99 - * Center for Language and Speech Processing - * - * Last Modified by: Yaser Al-Onaizan, August 17, 1999 - * - * Simple class used to calculate cross entropy and perplexity - * of models. - */ - -#include "Perplexity.h" - -void Perplexity::record(string model){ - modelid.push_back(model); - perp.push_back(perplexity()); - ce.push_back(cross_entropy()); -} diff --git a/ext/giza-pp/GIZA++-v2/Perplexity.h b/ext/giza-pp/GIZA++-v2/Perplexity.h deleted file mode 100644 index 50102806..00000000 --- a/ext/giza-pp/GIZA++-v2/Perplexity.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
-
-*/
-/* Perplexity.h
- * ============
- * Mike Jahr, 7/15/99
- * Machine Translation group, WS99
- * Center for Language and Speech Processing
- *
- * Last Modified by: Yaser Al-Onaizan, August 17, 1999
- *
- * Simple class used to calculate cross entropy and perplexity
- * of models.
- */
-
-#ifndef _PERPLEXITY_H
-#define _PERPLEXITY_H
-
-#include <math.h>
-#include <fstream>
-#include "Vector.h"
-#include "defs.h"
-#include "Array2.h"
-#include "Globals.h"
-
-#define CROSS_ENTROPY_BASE 2
-
-class Perplexity {
- private:
-    double sum;
-    double wc;
-    Array2<double, Vector<double> > *E_M_L;
-    Vector<string> modelid;
-    Vector<double> perp;
-    Vector<double> ce;
-    Vector<string> name ;
- public:
-    ~Perplexity() { delete E_M_L;}
-    Perplexity() {
-      E_M_L = new Array2<double, Vector<double> >(MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH);
-      unsigned int l, m ;
-      Vector<double> fact(MAX_SENTENCE_LENGTH, 1.0);
-      for (m = 2 ; m < MAX_SENTENCE_LENGTH ; m++)
-        fact[m] = fact[m-1] * m ;
-      for (m = 1 ; m < MAX_SENTENCE_LENGTH ; m++)
-        for (l = 1 ; l < MAX_SENTENCE_LENGTH ; l++) {
-          (*E_M_L)(l, m) = log (pow((LAMBDA * l), double(m)) * exp(-LAMBDA * double(l)) /
-                                (fact[m])) ;
-        }
-      sum = 0 ;
-      wc = 0;
-      perp.clear();
-      ce.clear();
-      name.clear();
-    }
-    inline void clear() {
-      sum = 0 ;
-      wc = 0 ;
-    }
-    size_t size() const {return(min(perp.size(), ce.size()));}
-    inline void addFactor(const double p, const double count, const int l,
-                          const int m,bool withPoisson) {
-      wc += count * m ; // number of french words
-      sum += count * ( (withPoisson?((*E_M_L)(l, m)):0.0) + p) ;
-    }
-    inline double perplexity() const {
-      return exp( -1*sum / wc);
-    }
-
-    inline double cross_entropy() const {
-      return (-1.0*sum / (log(double(CROSS_ENTROPY_BASE)) * wc));
-    }
-
-    inline double word_count() const {
-      return wc;
-    }
-
-    inline double getSum() const {
-      return sum ;
-    }
-
-    void record(string model);
-
-    friend void generatePerplexityReport(const Perplexity&, const Perplexity&,
-                                         const Perplexity&, const Perplexity&,
-                                         ostream&, int, int, bool);
-};
-
-
-#endif
diff --git a/ext/giza-pp/GIZA++-v2/Pointer.h b/ext/giza-pp/GIZA++-v2/Pointer.h
deleted file mode 100644
index 4892656c..00000000
--- a/ext/giza-pp/GIZA++-v2/Pointer.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
-
-Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef HEADER_Pointer_DEFINED
-#define HEADER_Pointer_DEFINED
-
-#include <assert.h>
-#include <iostream>
-
-template<class T>
-class SmartPointer
-{
- protected:
-  T*p;
- public:
-  SmartPointer(T*_p=0)
-    : p(_p) {}
-  inline T&operator*() const
-    {return *p;}
-  inline T*operator->() const
-    {return p;}
-  inline operator bool() const
-    {return p!=0;}
-  inline T*ptr() const
-    { return p; }
-};
-template<class T> inline ostream &operator<<(ostream&out,const SmartPointer<T>&s)
-{if( s.ptr() )return out << *s;else return out <<"nullpointer";}
-
-template<class T>
-class SmartPointerConst
-{
- protected:
-  const T*p;
- public:
-  SmartPointerConst(const T*_p=0)
-    : p(_p) {}
-  inline const T&operator*() const
-    {return *p;}
-  inline const T*operator->() const
-    {return p;}
-  inline operator bool() const
-    {return p!=0;}
-  inline const T*ptr() const
-    { return p; }
-};
-template<class T> inline ostream &operator<<(ostream&out,const SmartPointerConst<T>&s)
-{if( s.ptr() )return out << *s;else return out <<"nullpointer";}
-
-template<class T>
-class UP : public SmartPointer<T>
-{
- public:
-  UP(T*_p=0)
-    : SmartPointer<T>(_p) {}
-};
-template<class T> inline bool operator==(const UP<T>&s1,const UP<T>&s2)
-{return s1.ptr()==s2.ptr();}
-template<class T> inline bool operator<(const UP<T>&s1,const UP<T>&s2)
-{return s1.ptr() < s2.ptr();}
-template<class T> inline int Hash(const UP<T> &wp)
-{if(wp.ptr())return Hash(*wp);else return 0;}
-
-template<class T>
-class UPConst : public SmartPointerConst<T>
-{
- public:
-  UPConst(const T*_p=0)
-    : SmartPointerConst<T>(_p) {}
-};
-template<class T> inline bool operator==(const UPConst<T>&s1,const UPConst<T>&s2)
-{return s1.ptr()==s2.ptr();}
-template<class T> inline bool operator<(const UPConst<T>&s1,const UPConst<T>&s2)
-{return s1.ptr() < s2.ptr();}
-template<class T> inline int Hash(const UPConst<T> &wp)
-{if(wp.ptr())return Hash(*wp);else return 0;}
-
-template<class T>
-class MP : public SmartPointer<T>
-{
- public:
-  MP(T*_p=0)
-    : SmartPointer<T>(_p) {}
-};
-template<class T> inline bool operator==(const MP<T>&s1,const MP<T>&s2)
-{assert(s1);assert(s2);return *s1==*s2;}
-template<class T> inline bool operator<(const MP<T>&s1,const MP<T>&s2)
-{assert(s1);assert(s2);return *s1 < *s2;}
-template<class T> inline int Hash(const MP<T> &wp)
-{if(wp.ptr())return Hash(*wp);else return 0;}
-
-template<class T>
-class MPConst : public SmartPointerConst<T>
-{
- public:
-  MPConst(const T*_p=0)
-    : SmartPointerConst<T>(_p) {}
-};
-template<class T> inline bool operator==(const MPConst<T>&s1,const MPConst<T>&s2)
-{assert(s1);assert(s2);return *s1== *s2;}
-template<class T> inline bool operator<(const MPConst<T>&s1,const MPConst<T>&s2)
-{assert(s1);assert(s2);return *s1 < *s2;}
-template<class T> inline int Hash(const MPConst<T> &wp)
-{if(wp.ptr())return Hash(*wp);else return 0;}
-
-template<class T>
-class DELP : public SmartPointer<T>
-{
- private:
-  DELP(const DELP&x);
- public:
-  const DELP&operator=(DELP&x)
-    {
-      delete this->p;
-      this->p=x.p;x.p=0;
-      return *this;
-    }
-
-  ~DELP()
-    { delete this->p;this->p=0;}
-  DELP(T*_p=0)
-    : SmartPointer<T>(_p) {}
-  void set(T*_p)
-    {
-      delete this->p;
-      this->p=_p;
-    }
-  friend bool operator==(const DELP&s1,const DELP&s2)
-    {
-      return *(s1.p)== *(s2.p);
-    }
-  friend bool operator<(const DELP&s1,const DELP&s2)
-    {
-      return *(s1.p) < *(s2.p);
-    }
-  friend inline int Hash(const DELP &wp)
-    {
-      if(wp.p)
-        return Hash(*wp.p);
-      else
-        return 0;
-    }
-};
-#endif
-
-
-
-
-
-
-
diff --git a/ext/giza-pp/GIZA++-v2/README b/ext/giza-pp/GIZA++-v2/README
deleted file mode 100644
index 25af2886..00000000
--- a/ext/giza-pp/GIZA++-v2/README
+++ /dev/null
@@ -1,508 +0,0 @@
-========================================================================
-GIZA++ is an extension of the program GIZA.
-It is a program for learning statistical translation models from
-bitext. It is an implementation of the models described in
-(Brown et al., 1993), (Vogel et al., 1996), (Och et al., 2000a),
-(Och et al., 2000b).
-========================================================================
-
-
-
-CONTENTS of this README file:
-
-Part I:   GIZA++ Package Programs
-Part II:  How To Compile GIZA++
-Part III: How To Run GIZA++
-Part IV:  Input File Formats
-   A. VOCABULARY FILES
-   B. Bitext Files
-   C. Dictionary File (optional)
-Part V:   Output File Formats:
-   A. PROBABILITY TABLES
-      1. T TABLE (translation table)
-      2. N TABLE (Fertility table)
-      3. P0 TABLE
-      4. A TABLE
-      5. D3 TABLE
-      6. D4 TABLE
-      7. D5 TABLE
-      8. HMM TABLE
-   B. ALIGNMENT FILE
-   C. Cross Entropy and Perplexity Files
-   D. Revised Vocabulary files
-Part VI:  Literature
-Part VII: New features
-
-HISTORY of this README file:
-
-GIZA++:
-edited: 11 Jan. 2000, Franz Josef Och
-GIZA:
-edited: 16 Aug. 1999, Dan Melamed
-edited: 13 Aug. 1999, Yaser Al-Onaizan
-edited: 20 July 1999, Yaser Al-Onaizan
-edited: 15 July 1999, Yaser Al-Onaizan
-edited: 13 July 1999, Noah Smith
-========================================================================
-
-Part 0: What is GIZA++
-
-GIZA++ is an extension of the program GIZA (part of the SMT toolkit
-EGYPT - http://www.clsp.jhu.edu/ws99/projects/mt/toolkit/ ) which was
-developed by the Statistical Machine Translation team during the
-summer workshop in 1999 at the Center for Language and Speech
-Processing at Johns Hopkins University (CLSP/JHU). GIZA++ includes
-many additional features. The extensions of GIZA++ were designed and
-written by Franz Josef Och.
-
-Features of GIZA++ not in GIZA:
-
-- Implements the full IBM-4 alignment model with a dependency on word
-classes as described in (Brown et al. 1993)
-
-- Implements IBM-5: dependency on word classes, smoothing, ...
-
-- Implements the HMM alignment model: Baum-Welch training, Forward-Backward
-algorithm, empty word, dependency on word classes, transfer to
-fertility models, ...
-
-- Implementation of a variant of the IBM-3 and IBM-4
-(-deficientDistortionModel 1) models which allows the training of
-p0
-
-- Smoothing for fertility and distortion/alignment parameters
-
-- Significantly more efficient training of the fertility models
-
-- Correct implementation of pegging as described in (Brown et
-al. 1993), together with a series of heuristics that make pegging
-sufficiently efficient
-
-- Completely new parameter mechanism that makes it easy to add
-additional parameters
-
-- Improved perplexity calculation for the models IBM-1, IBM-2 and HMM
-(the parameter of the Poisson distribution of the sentence lengths is
-computed automatically from the training corpus used)
-
-========================================================================
-Part I: GIZA++ Package Programs
-
-GIZA++: GIZA++ itself
-
-plain2snt.out: simple tool to transform plain text into the GIZA text
-format
-
-snt2plain.out: simple tool to transform the GIZA text format into plain
-text
-
-trainGIZA++.sh: shell script to perform standard training given a
-corpus in the GIZA text format
-
-========================================================================
-Part II: How To Compile GIZA++
-
-In order to compile GIZA++ you may need:
-- a recent version of the GNU compiler (2.95 or higher)
-- recent versions of the assembler and linker which do not have
-  restrictions with respect to the length of symbol names
-
-There is a makefile in the src directory that will take care of the
-compilation.
The most important targets are:
-
-GIZA++: generates an optimized version
-
-GIZA++.dbg: generates the debug version
-
-depend: generates the "dependencies" file (make this whenever you add
-source or header files to the package)
-
-========================================================================
-Part III: How To Run GIZA++
-
-It's simple:
-
-GIZA++ [config-file] [options]
-
-All options which expect a parameter can also be set in the
-config file. For example, the command line options
-
-  GIZA++ -S S.vcb -T T.vcb -C ST.snt
-
-correspond to the config file:
-
-  S: S.vcb
-  T: T.vcb
-  C: ST.snt
-
-If you call GIZA++ without a parameter you get a list of all the
-options. The option names from GIZA are normally still valid. The
-default values of the parameters are optimized with respect to the
-corpora I use and typically give good results. It is nevertheless
-important that these parameters are optimized for every new task.
-
-==========================================================================
-Part IV: Input File Formats
-
-A. VOCABULARY FILES
-
-Each entry is stored on one line as follows:
-
-  uniq_id1 string1 no_occurrences1
-  uniq_id2 string2 no_occurrences2
-  uniq_id3 string3 no_occurrences3
-  ....
-
-Here is a sample from an English vocabulary file:
-
-627 abandon 10
-628 abandoned 17
-629 abandoning 2
-630 abandonment 12
-631 abatement 8
-632 abbotsford 2
-
-uniq_ids are sequential positive integer numbers. 0 is reserved for
-the special token NULL.
-
-
-B. Bitext Files
-
-Each sentence pair is stored in three lines. The first line
-is the number of times this sentence pair occurred. The second line is
-the source sentence, where each token is replaced by its unique integer
-id from the vocabulary file, and the third is the target sentence in
-the same format.
-
-Here's a sample of 3 sentence pairs from an English/French corpus:
-
-1
-1 1 226 5008 621 6492 226 6377 6813 226 9505 5100 6824 226 5100 5222 0 614 10243 613
-2769 155 7989 585 1 578 6503 585 8242 578 8142 8541 578 12328 6595 8550 578 6595 6710 1
-1
-1 1 226 6260 11856 11806 1293
-11 1 1 11 155 14888 2649 11447 9457 8488 4168
-1
-1 1 226 7652 1 226 5337 226 6940 12089 5582 8076 12050
-1 1 155 4140 6812 153 1 154 155 14668 15616 10524 9954 1392
-
-C. Dictionary File (optional)
-
-The dictionary file is optional. It must be a list of pairs, one per
-line:
-
-  F E
-
-where F is the integer id of a target token, and E is the integer id
-of a source token. F may be listed with other Es, and vice versa.
-
-Important: The dictionary must be sorted by the F integers!
-
-If you provide a dictionary and list it in the configuration file,
-GIZA++ will change the cooccurrence counting in the first iteration
-of model 1 to honor the so-called "Dictionary Constraint":
-
-    In parallel sentences "e1 ... en" and "f1 ... fm",
-    ei and fj are counted as a cooccurrence pair if one of two
-    conditions is met: 1.) ei and fj occur as an entry in the
-    dictionary, or 2.) ei does not occur in the dictionary with
-    any fk (1 <= k <= m) and fj does not occur in the dictionary
-    with any ek (1 <= k <= n).
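The input formats above are simple enough to emit directly from your own code. As a minimal sketch (not part of the GIZA++ package; the function name is invented for illustration), the following writes one snt record per sentence pair, assuming the tokens have already been mapped to the integer ids of the vocabulary files:

#include <fstream>
#include <vector>

// Illustrative sketch, not part of GIZA++.
// Write one GIZA snt record: occurrence-count line, source-id line,
// target-id line. Ids must match the .vcb files; id 0 is reserved for
// the special NULL token and must not appear inside a sentence.
void writeSntRecord(std::ofstream& out, int count,
                    const std::vector<int>& srcIds,
                    const std::vector<int>& trgIds) {
  out << count << '\n';
  for (size_t i = 0; i < srcIds.size(); ++i)
    out << (i ? " " : "") << srcIds[i];
  out << '\n';
  for (size_t i = 0; i < trgIds.size(); ++i)
    out << (i ? " " : "") << trgIds[i];
  out << '\n';
}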
-
-==========================================================================
-Part V: Output File Formats:
-
-For file names, we will use the prefix "prob_table". This can be
-changed using the -o switch. The default is a combination of user id
-and time stamp.
-
-
-A. PROBABILITY TABLES
-
-Normally, Model1 is trained first, and the result is used to start
-Model2 training. Then Model2 is transferred to Model3. Model3 viterbi
-training follows. This sequence can be adjusted by the various
-options, either on the command line or in a config file.
-
-1. T TABLE ( *.t3.* )
-
-(translation table)
-
-  prob_table.t1.n   = t table after n iterations of Model1 training
-  prob_table.t2.n   = t table after n iterations of Model2 training
-  prob_table.t2to3  = t table after transferring Model2 to Model3
-  prob_table.t3.n   = t table after n iterations of Model3 training
-  prob_table.4.n    = t table after n iterations of Model4 training
-
-Each line is of the following format:
-
-s_id t_id P(t_id/s_id)
-
-where:
-  s_id: is the unique id for the source token
-  t_id: is the unique id for the target token
-  P(t_id/s_id): the probability of translating s_id as t_id
-
-sample part of a file:
-
-3599 5697 0.0628115
-2056 10686 0.000259988
-8227 3738 3.57132e-13
-5141 13720 5.52332e-12
-10798 4102 6.53047e-06
-8227 3750 6.97502e-14
-7712 14080 6.0365e-20
-7712 14082 2.68323e-17
-7713 1083 3.94464e-15
-7712 14084 2.98768e-15
-
-Similar files will be generated (with the infix "actual", i.e.
-"prob_table.actual.xxx") that contain the actual tokens instead of
-their unique ids. This is also true for fertility tables. Also, the
-inverse probability table will be generated for the final table; it
-will have the infix "ti".
-
-2. N TABLE ( *.n3.* )
-
-(Fertility table)
-
-  prob_table.n2to3 = n table estimated during the transfer from M2 to M3
-  prob_table.n3.X  = n table after X iterations of model3
-
-Each line in this file is of the following format:
-
-source_token_id p0 p1 p2 .... pn
-
-where p0 is the probability that the source token has zero fertility;
-p1, fertility one, ....; and n is the maximum possible fertility as
-defined in the program.
-
-sample:
-
-1 0.475861 0.282418 0.133455 0.0653083 0.0329326 0.00844979 0.0014008
-10 0.249747 0.000107778 0.307767 0.192208 0.0641439 0.15016 0.0358886
-11 0.397111 0.390421 0.19925 0.013382 2.21286e-05 0 0
-12 0.0163432 0.560621 0.374745 0.00231588 0 0 0
-13 1.78045e-07 0.545694 0.299573 0.132127 0.0230494 9.00322e-05 0
-14 1.41918e-18 0.332721 0.300773 0.0334969 0 0 0
-15 0 5.98626e-10 0.47729 0.0230955 0 0 0
-17 0 1.66346e-07 0.895883 0.103948 0 0 0
-
-
-3. P0 TABLE ( *.p0* )
-
-(1 - P0 is the probability of inserting a null after a
-source word.)
-
-This file contains only one line with one real number which is the
-value of P0, the probability of not inserting a NULL token.
-
-
-4. A TABLE ( *.a[23].* )
-
-The file names follow the naming conventions above. The format of each
-line is as follows:
-
-i j l m p(i | j, l, m)
-
-where i, j, l, m are all integers and
-  j = position in target sentence
-  i = position in source sentence
-  l = length of source sentence
-  m = length of target sentence
-and p(i/j,l,m) is the probability that a source word in position i is
-moved to position j in a pair of sentences of length l and m.
-
-sample:
-
-15 14 15 14 0.630798
-15 14 15 15 0.414137
-15 14 15 16 0.268919
-15 14 15 17 0.23171
-15 14 15 18 0.117311
-15 14 15 19 0.119202
-15 14 15 20 0.111369
-15 14 15 21 0.0358169
-
-
-5. D3 TABLE ( *.d3.* )
-
-(distortion table)
-
-The format is similar to the A table, with a slight difference: the
-positions of i & j are switched:
-
-j i l m p(j/i,l,m)
-
-sample:
-
-15 14 14 15 0.286397
-15 14 14 16 0.138898
-15 14 14 17 0.109712
-15 14 14 18 0.0868322
-15 14 14 19 0.0535823
-
-6. D4 TABLE ( *.d4.* )
-
-(distortion table for IBM-4)
-
-7. D5 TABLE ( *.d5.* )
-
-(distortion table for IBM-5)
-
-8. HMM TABLE ( *.hhmm.* )
-
-(alignment probability table for the HMM alignment model)
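All of these tables are plain whitespace-separated text, so they are easy to post-process with a few lines of code. For instance, the following sketch (not part of GIZA++; the function name is invented) scans a t table in the "s_id t_id p" format above and keeps the most probable target id for every source id:

#include <fstream>
#include <map>
#include <utility>

// Illustrative sketch, not part of GIZA++.
// Scan a t table ("s_id t_id p" per line) and remember, for each source
// id, the target id with the highest probability.
std::map<int, std::pair<int, double> > bestTranslations(const char* file) {
  std::map<int, std::pair<int, double> > best;
  std::ifstream in(file);
  int s, t;
  double p;
  while (in >> s >> t >> p)
    if (p > best[s].second)  // a fresh map entry starts as (0, 0.0)
      best[s] = std::make_pair(t, p);
  return best;
}

A std::map keeps the result ordered by source id; a hash map would work just as well if ordering does not matter.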
-
-B. ALIGNMENT FILE ( *.A3.* )
-
-In each iteration of the training, and for each sentence pair in the
-training set, the best alignment (viterbi alignment) is written to the
-alignment file (if the dump parameters are set accordingly). The
-alignment file is named prob_table.An.i, where n is the model number
-({1, 2, 2to3, 3 or 4}), and i is the iteration number. The format of
-the alignment file is illustrated in the following sample:
-
-# Sentence pair (1)
-il s' agit de la même société qui a changé de propriétaires
-NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
-# Sentence pair (2)
-UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
-NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
-
-Each sentence pair is represented by three lines in the alignment
-file. The first line is a label that can be used, e.g., as a caption
-for alignment visualization tools. It contains information about the
-sentence sequential number in the training corpus, sentence lengths,
-and alignment probability. The second line is the target sentence, the
-third line is the source sentence. Each token in the source sentence
-is followed by a set of zero or more numbers. These numbers represent
-the positions of the target words to which this source word is
-connected, according to the alignment.
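Since every source token on the third line is followed by its aligned target positions inside a "({ ... })" group, the alignment pairs can be recovered with a simple tokenizer. The sketch below is illustrative only (it is not shipped with GIZA++, and it assumes no source token is itself the literal string "({" or "})"):

#include <cstdlib>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Illustrative sketch, not part of GIZA++.
// Parse the third line of an A3 record into (source position, target
// position) pairs. Position 0 is the NULL word; the numbers inside each
// "({ ... })" group are 1-based target positions.
std::vector<std::pair<int, int> > parseA3Line(const std::string& line) {
  std::vector<std::pair<int, int> > links;
  std::istringstream in(line);
  std::string tok;
  int srcPos = -1;   // becomes 0 at the leading NULL token
  bool inGroup = false;
  while (in >> tok) {
    if (tok == "({")
      inGroup = true;
    else if (tok == "})")
      inGroup = false;
    else if (inGroup)
      links.push_back(std::make_pair(srcPos, std::atoi(tok.c_str())));
    else
      ++srcPos;      // the next source token starts a new group
  }
  return links;
}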
-
-C. Perplexity File ( *.perp )
-
-This file will be generated at the end of training. It summarizes
-perplexity values for each training iteration. Here is a sample
-perplexity file that illustrates the format. The format is the same
-for cross entropy. If no test corpus was provided, the values for it
-will be set to "N/A".
-
-# train-size test-size iter. model train-perplexity test-perplexity final(y/n) train-viterbi-perp test-viterbi-perp
- 447136 9625 0 1 187067 186722 n 3.34328e+06 3.35352e+06
- 447136 9625 1 1 192.88 248.763 n 909.879 1203.13
- 447136 9625 2 1 99.45 139.214 n 316.363 459.745
- 447136 9625 3 1 83.4746 126.046 n 214.612 341.27
- 447136 9625 4 1 78.6939 124.914 n 179.218 303.169
- 447136 9625 5 2 76.6848 125.986 n 161.874 286.226
- 447136 9625 6 2 50.7452 86.2273 n 84.7227 151.701
- 447136 9625 7 2 42.9178 74.5574 n 63.6644 116.034
- 447136 9625 8 2 40.0651 70.7444 n 56.3186 104.274
- 447136 9625 9 2 38.8471 69.4105 n 53.1277 99.6044
- 447136 9625 10 2to3 38.2561 68.9576 n 51.4856 97.4414
- 447136 9625 11 3 129.993 248.885 n 86.6675 165.012
- 447136 9625 12 3 79.2212 169.902 n 86.4842 171.367
- 447136 9625 13 3 75.0746 164.488 n 84.9647 172.639
- 447136 9625 14 3 73.412 162.765 n 83.5762 172.797
- 447136 9625 15 3 72.6107 162.254 y 82.4575 172.688
-
-
-D. Revised Vocabulary files (*.src.vcb, *.trg.vcb)
-
-The revised vocabulary files are similar in format to the original
-vocabulary files. The only exception is that the frequency for each
-token is calculated from the given corpus (i.e. it is exact), which is
-not required in the input.
-
-E. Final parameter file ( *.gizacfg )
-
-This file includes all the parameter settings that were used to
-perform this training. This means that starting GIZA++ with this
-parameter file produces (should produce) the same training.
-
-
-
-Part VI: LITERATURE
--------------------
-
-The following two articles include a comparison of the alignment
-models implemented in GIZA++:
-
-@INPROCEEDINGS{och00:isa,
-  AUTHOR = {F.~J.~Och and H.~Ney},
-  TITLE ={Improved Statistical Alignment Models},
-  BOOKTITLE = ACL00,
-  PAGES ={440--447},
-  ADDRESS={Hongkong, China},
-  MONTH = {October},
-  YEAR = 2000}
-
-@INPROCEEDINGS{och00:aco,
-  AUTHOR = {F.~J.~Och and H.~Ney},
-  TITLE = {A Comparison of Alignment Models for Statistical Machine Translation},
-  BOOKTITLE = COLING00,
-  ADDRESS = {Saarbr\"ucken, Germany},
-  YEAR = {2000},
-  MONTH = {August},
-  PAGES = {1086--1090}
-  }
-
-The following article describes the statistical machine translation
-toolkit EGYPT:
-
-@MISC{ alonaizan99:smt,
-AUTHOR = {Y. Al-Onaizan and J. Curin and M. Jahr and K. Knight and J. Lafferty and I. D. Melamed and F. J. Och and D. Purdy and N. A. Smith and D. Yarowsky},
-TITLE = {Statistical Machine Translation, Final Report, {JHU} Workshop},
-YEAR = {1999},
-ADDRESS = {Baltimore, Maryland, MD},
-NOTE={{\tt http://www.clsp.jhu.edu/ws99/projects/mt/final\_report/mt-final-report.ps}}
-}
-
-The implemented alignment models IBM-1 to IBM-5 and HMM were originally described in:
-
-@ARTICLE{brown93:tmo,
-  AUTHOR = {Brown, P. F. and Della Pietra, S. A. and Della Pietra, V. J. and Mercer, R. L.},
-  TITLE = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
-  JOURNAL = {Computational Linguistics},
-  YEAR = 1993,
-  VOLUME = 19,
-  NUMBER = 2,
-  PAGES = {263--311}
-}
-
-@INPROCEEDINGS{ vogel96:hbw,
-  AUTHOR = {Vogel, S. and Ney, H. and Tillmann, C.},
-  TITLE = {{HMM}-Based Word Alignment in Statistical Translation},
-  YEAR = 1996,
-  PAGES = {836--841},
-  MONTH = {August},
-  ADDRESS = {Copenhagen},
-  BOOKTITLE = COLING96
-}
-
-
-Part VII: New features
-======================
-
-2003-06-09:
-
-- new parameter "-nbestalignments N": prints an N-best list of
-  alignments into a file *.NBEST
-
-- If the program is compiled with "-DBINARY_SEARCH_FOR_TTABLE", it uses
-  more memory-efficient data structures for the t table (a vector with
-  binary search instead of a hash table). The program then expects a
-  parameter "-CoocurrenceFile FILE" which specifies a file that
-  includes all lexical cooccurrences in the training corpus. This file
-  can be produced by the snt2cooc.out tool.
-
-
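Conceptually, the cooccurrence file enumerates the source/target id pairs that appear together in at least one sentence pair of the training corpus. The sketch below shows only this counting idea; it is not the snt2cooc.out implementation, and the exact on-disk format that snt2cooc.out produces (ordering, any extra fields) should be taken from the tool itself:

#include <fstream>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Illustrative sketch, not the snt2cooc.out implementation.
// Collect every (source id, target id) pair that cooccurs in some
// sentence pair of an snt file (records of three lines: count line,
// source-id line, target-id line).
std::set<std::pair<int, int> > collectCooccurrences(const char* sntFile) {
  std::set<std::pair<int, int> > cooc;
  std::ifstream in(sntFile);
  std::string countLine, srcLine, trgLine;
  while (std::getline(in, countLine) && std::getline(in, srcLine) &&
         std::getline(in, trgLine)) {
    std::vector<int> src, trg;
    std::istringstream s(srcLine), t(trgLine);
    for (int id; s >> id; ) src.push_back(id);
    for (int id; t >> id; ) trg.push_back(id);
    for (size_t i = 0; i < src.size(); ++i)
      for (size_t j = 0; j < trg.size(); ++j)
        cooc.insert(std::make_pair(src[i], trg[j]));
  }
  return cooc;
}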
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "TTables.h" -#include "Parameter.h" - -GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7); -GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6); - -#ifdef BINARY_SEARCH_FOR_TTABLE -template -void tmodel::printCountTable(const char *, - const Vector&, - const Vector&, - const bool) const -{ -} - -template -void tmodel::printProbTable(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const bool actual) const -{ - ofstream of(filename); - /* for(unsigned int i=0;isize();++j) - { - const CPPair&x=(*lexmat[i])[j].second; - WordIndex e=i,f=(*lexmat[i])[j].first; - if( x.prob>PROB_SMOOTH ) - if( actual ) - of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n'; - else - of << e << ' ' << f << ' ' << x.prob << '\n'; - } - } -} - -template -void tmodel::printProbTableInverse(const char *, - const Vector&, - const Vector&, - const double, - const double, - const bool ) const -{ -} -template -void tmodel::normalizeTable(const vcbList&, const vcbList&, int) -{ - for(unsigned int i=0;isize(); - for(unsigned int j=0;j -void tmodel::readProbTable(const char *){ -} - -template class tmodel ; -#else -/* ------------------ Method Definiotns for Class tmodel --------------------*/ - -# -template -void tmodel::printCountTable(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const bool actual) const - // this function dumps the t table. Each line is of the following format: - // - // c(target_word/source_word) source_word target_word -{ - ofstream of(filename); - typename hash_map >::const_iterator i; - for(i = ef.begin(); i != ef.end();++i){ - if ( ((*i).second).count > COUNTINCREASE_CUTOFF) - if (actual) - of << ((*i).second).count << ' ' << evlist[ ((*i).first).first ].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n'; - else - of << ((*i).second).count << ' ' << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n'; - } -} - -template -void tmodel::printProbTable(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const bool actual) const - // this function dumps the t table. Each line is of the following format: - // - // source_word target_word p(target_word/source_word) -{ - ofstream of(filename); - typename hash_map >::const_iterator i; - for(i = ef.begin(); i != ef.end();++i) - if( actual ) - of << evlist[((*i).first).first].word << ' ' << - fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n'; - else - of << ((*i).first).first << ' ' << ((*i).first).second << ' ' << - (*i).second.prob << '\n'; -} - -template -void tmodel::printProbTableInverse(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const double, - const double, - const bool actual) const - // this function dumps the inverse t table. 
Each line is of the format: - // - // target_word_id source_word_id p(source_word/target_word) - // - // if flag "actual " is true then print actual word entries instead of - // token ids -{ - cerr << "Dumping the t table inverse to file: " << filename << '\n'; - ofstream of(filename); - typename hash_map >::const_iterator i; - PROB p_inv = 0 ; - // static const PROB ratio(double(fTotal)/eTotal); - WordIndex e, f ; - int no_errors(0); - vector total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f/e) * p(e) - needed for normalization - - for(i = ef.begin(); i != ef.end(); i++){ - e = ((*i).first).first ; - f = ((*i).first).second ; - total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei) - } - - for(i = ef.begin(); i != ef.end(); i++){ - e = ((*i).first).first ; - f = ((*i).first).second ; - p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ; - if (p_inv > 1.0001 || p_inv < 0){ - no_errors++; - if (no_errors <= 10){ - cerr << "printProbTableInverse(): Error - P("< >::const_iterator i; - PROB p_inv = 0 ; - static const PROB ratio(double(fTotal)/eTotal); - WordIndex e, f ; - for(i = ef.begin(); i != ef.end(); i++){ - e = ((*i).first).first ; - f = ((*i).first).second ; - p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq / - (PROB) fvlist[f].freq ; - if (actual) - of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n'; - else - of << f << ' ' << e << ' ' << p_inv << '\n'; - } -} -*/ -template -void tmodel::normalizeTable(const vcbList&engl, const vcbList&french, int iter) - // normalize conditional probability P(fj/ei): - // i.e. make sure that Sum over all j of P(fj/e) = 1 - // this method reads the counts portion of the table and normalize into - // the probability portion. Then the counts are cleared (i.e. zeroed) - // if the resulting probability of an entry is below a threshold, then - // remove it . -{ - if( iter==2 ) - { - total2.resize(engl.uniqTokens());for(unsigned int i=0;i total(engl.uniqTokens(),0.0); - //Vector nFrench(engl.uniqTokens(), 0); - //Vector nEng(french.uniqTokens(), 0); - - typename hash_map >::const_iterator i; - for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e - if( iter==2 ) - total2[((*i).first).first] += (*i).second.count; - total[((*i).first).first] += (*i).second.count; - nFrench[((*i).first).first]++; - nEng[((*i).first).second]++; - } - for(unsigned int k=0;k >::iterator j, k; - PROB p ; - int nParams=0; - for(j = ef.begin(); j != ef.end(); ){ - k = j; - k++ ; - if( (total[((*j).first).first])>0.0 ) - p = ((((*j).second).count) /(total[((*j).first).first])) ; - else - p= 0.0; - if (p > PROB_CUTOFF) - { - if( iter>0 ) - { - ((*j).second).prob = 0 ; - ((*j).second).count = p ; - } - else - { - ((*j).second).prob = p ; - ((*j).second).count = 0 ; - } - nParams++; - } - else { - erase(((*j).first).first, ((*j).first).second); - } - j = k ; - } - if( iter>0 ) - return normalizeTable(engl, french, iter-1); - else - { - } -} - -template -void tmodel::readProbTable(const char *filename){ - /* This function reads the t table from a file. - Each line is of the format: source_word_id target_word_id p(target_word|source_word) - This is the inverse operation of the printTable function. - NAS, 7/11/99 - */ - ifstream inf(filename); - cerr << "Reading t prob. 
table from " << filename << "\n"; - if(!inf){ - cerr << "\nERROR: Cannot open " << filename << "\n"; - return; - } - WordIndex src_id, trg_id; - PROB prob; - int nEntry=0; - while( inf >> src_id >> trg_id >> prob){ - insert(src_id, trg_id, 0.0, prob); - nEntry++; - } - cerr << "Read " << nEntry << " entries in prob. table.\n"; -} - -template class tmodel ; - -/* ---------------- End of Method Definitions of class tmodel ---------------*/ - - -#endif diff --git a/ext/giza-pp/GIZA++-v2/TTables.h b/ext/giza-pp/GIZA++-v2/TTables.h deleted file mode 100644 index 85673ef0..00000000 --- a/ext/giza-pp/GIZA++-v2/TTables.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/* --------------------------------------------------------------------------* - * * - * Module : TTables * - * * - * Prototypes File: TTables.h * - * * - * Objective: Defines clases and methods for handling I/O for Probability & * - * Count tables and also alignment tables * - *****************************************************************************/ - -#ifndef _ttables_h -#define _ttables_h 1 - - -#include "defs.h" -#include "vocab.h" - -#include - -#include -#include -#include -#include -#include -#include "Vector.h" -#include - -#include - -#include "Globals.h" - - -/* The tables defined in the following classes are defined as hash tables. For - example. the t-table is a hash function of a word pair; an alignment is - a hash function of a vector of integer numbers (sentence positions) and so - on */ - - -/*----------- Defnition of Hash Function for class tmodel ------- -----------*/ - -typedef pair wordPairIds; - - -class hashpair : public unary_function< pair, size_t > -{ -public: - size_t operator() (const pair& key) const - { - return (size_t) MAX_W*key.first + key.second; /* hash function and it - is guarnteed to have - unique id for each - unique pair */ - } -}; - - - -/* ------------------ Class Prototype Definitions ---------------------------* - Class Name: tmodel - Objective: This defines the underlying data structur for t Tables and t - Count Tables. They are defined as a hash table. Each entry in the hash table - is the probability (P(fj/ei) ) or count collected for ( C(fj/ei)). The - probability and the count are represented as log integer probability as - defined by the class LogProb . - - This class is used to represents t Tables (probabiliity) and n (fertility - Tables and also their corresponding count tables . 
- - *---------------------------------------------------------------------------*/ - -//typedef float COUNT ; -//typedef LogProb PROB ; -template -class LpPair { - public: - COUNT count ; - PROB prob ; - public: // constructor - LpPair():count(0), prob(0){} ; - LpPair(COUNT c, PROB p):count(c), prob(p){}; -} ; - -#ifdef BINARY_SEARCH_FOR_TTABLE - - -template -T*mbinary_search(T*x,T*y,unsigned int val) -{ - if( y-x==0 ) - return 0; - if( x->first==val) - return x; - if( y-x<2 ) - return 0; - T*mid=x+(y-x)/2; - if( val < mid->first ) - return mbinary_search(x,mid,val); - else - return mbinary_search(mid,y,val); - -} - -template -const T*mbinary_search(const T*x,const T*y,unsigned int val) -{ - if( y-x==0 ) - return 0; - if( x->first==val) - return x; - if( y-x<2 ) - return 0; - const T*mid=x+(y-x)/2; - if( val < mid->first ) - return mbinary_search(x,mid,val); - else - return mbinary_search(mid,y,val); - -} - -template -class tmodel{ - typedef LpPair CPPair; - public: - int noEnglishWords; // total number of unique source words - int noFrenchWords; // total number of unique target words - //vector > fs; - //vector es; - vector< vector >* > lexmat; - - void erase(WordIndex e, WordIndex f) - { - CPPair *p=find(e,f); - if(p) - *p=CPPair(0,0); - }; - CPPair*find(int e,int f) - { - //pair *be=&(fs[0])+es[e]; - //pair *en=&(fs[0])+es[e+1]; - pair *be=&(*lexmat[e])[0]; - pair *en=&(*lexmat[e])[0]+(*lexmat[e]).size(); - pair *x= mbinary_search(be,en,f); - if( x==0 ) - { - //cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n'; - //abort(); - return 0; - } - return &(x->second); - } - const CPPair*find(int e,int f)const - { - const pair *be=&(*lexmat[e])[0]; - const pair *en=&(*lexmat[e])[0]+(*lexmat[e]).size(); - //const pair *be=&(fs[0])+es[e]; - //const pair *en=&(fs[0])+es[e+1]; - const pair *x= mbinary_search(be,en,f); - if( x==0 ) - { - //cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n'; - //abort(); - return 0; - } - - return &(x->second); - } -public: - void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){ - *find(e,f)=CPPair(cval,pval); - } - CPPair*getPtr(int e,int f){return find(e,f);} - tmodel(const string&fn) - { - int count=0,count2=0; - ifstream infile2(fn.c_str()); - int e,f,olde=-1,oldf=-1; - pair cp; - vector< pair > cps; - while(infile2>>e>>f) - { - cp.first=f; - assert(e>=olde); - assert(e>olde ||f>oldf); - if( e!=olde&&olde>=0 ) - { - int oldsize=lexmat.size(); - lexmat.resize(olde+1); - for(unsigned int i=oldsize;i > (cps); - cps.clear(); - if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) ) - cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl; - count2+=lexmat[olde]->capacity(); - } - cps.push_back(cp); - olde=e; - oldf=f; - count++; - } - lexmat.resize(olde+1); - lexmat[olde]=new vector< pair > (cps); - count2+=lexmat[olde]->capacity(); - cout << "There are " << count << " " << count2 << " entries in table" << '\n'; - } - - - /* tmodel(const string&fn) - { - size_t count=0; - { - ifstream infile1(fn.c_str()); - if( !infile1 ) - { - cerr << "ERROR: can't read coocurrence file " << fn << '\n'; - abort(); - } - int e,f; - while(infile1>>e>>f) - count++; - } - cout << "There are " << count << " entries in table" << '\n'; - ifstream infile2(fn.c_str()); - fs.resize(count); - int e,f,olde=-1,oldf=-1; - pair cp; - count=0; - while(infile2>>e>>f) - { - assert(e>=olde); - assert(e>olde ||f>oldf); - if( e!=olde ) - { - es.resize(e+1); - for(unsigned int i=olde+1;int(i)<=e;++i) - es[i]=count; - 
} - cp.first=f; - assert(countcount += inc ; - } - } - - PROB getProb(WordIndex e, WordIndex f) const - { - const CPPair *p=find(e,f); - if( p ) - return max(p->prob, PROB_SMOOTH); - else - return PROB_SMOOTH; - } - - COUNT getCount(WordIndex e, WordIndex f) const - { - const CPPair *p=find(e,f); - if( p ) - return p->count; - else - return 0.0; - } - - void printProbTable(const char* filename, const Vector&, const Vector&,bool actual) const; - void printCountTable(const char* filename, const Vector&, const Vector&,bool actual) const; - void printProbTableInverse(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const double eTotal, - const double fTotal, - const bool actual = false ) const; - void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2); - void readProbTable(const char *filename); -}; - - -#else - - -template -class tmodel{ - typedef LpPair CPPair; - public: - int noEnglishWords; // total number of unique source words - int noFrenchWords; // total number of unique target words - hash_map > ef; - void erase(WordIndex e, WordIndex f) - // In: a source and a target token ids. - // removes the entry with that pair from table - { - ef.erase(wordPairIds(e, f)); - }; - -public: - Vector total2; - Vector nFrench; - Vector nEng; - - - // methods; - - // insert: add entry P(fj/ei) to the hash function, Default value is 0.0 - void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){ - ef[wordPairIds(e, f)].count = cval ; - ef[wordPairIds(e, f)].prob = pval ; - } - - // returns a reference to the word pair, if does not exists, it creates it. - CPPair&getRe(WordIndex e, WordIndex f) - {return ef[wordPairIds(e, f)];} - - // returns a pointer to an existing word pair. if pair does not exists, - // the method returns the zero pointer (NULL) - - CPPair*getPtr(WordIndex e, WordIndex f) - { - // look up this pair and return its position - typename hash_map >::iterator i = ef.find(wordPairIds(e, f)); - if(i != ef.end()) // if it exists, return a pointer to it. - return(&((*i).second)); - else return(0) ; // else return NULL pointer - } - - void incCount(WordIndex e, WordIndex f, COUNT inc) - // increments the count of the given word pair. if the pair does not exist, - // it creates it with the given value. 
- { - if( inc ) - ef[wordPairIds(e, f)].count += inc ; - } - - PROB getProb(WordIndex e, WordIndex f) const - // read probability value for P(fj/ei) from the hash table - // if pair does not exist, return floor value PROB_SMOOTH - { - typename hash_map >::const_iterator i= ef.find(wordPairIds(e, f)); - if(i == ef.end()) - return PROB_SMOOTH; - else - return max(((*i).second).prob, PROB_SMOOTH); - } - - COUNT getCount(WordIndex e, WordIndex f) const - /* read count value for entry pair (fj/ei) from the hash table */ - { - typename hash_map >::const_iterator i= ef.find(wordPairIds(e, f)); - if(i == ef.end()) - return 0; - else - return ((*i).second).count; - } - - inline const hash_map >& getHash(void) const {return ef;}; - /* get a refernece to the hash table */ - //inline void resize(WordIndex n) {ef.resize(n);}; - // to resize he hash table - - void printProbTable(const char* filename, const Vector&, const Vector&,bool actual) const; - void printCountTable(const char* filename, const Vector&, const Vector&,bool actual) const; - // print the t table to the given file but this time print actual source and - // target words instead of thier token ids - - void printProbTableInverse(const char *filename, - const Vector& evlist, - const Vector& fvlist, - const double eTotal, - const double fTotal, - const bool actual = false ) const; - // dump inverse of t table (i.e P(ei/fj)) to the given file name, - // if the given flag is true then actual words are printed not token ids - - void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2); - // to norlmalize the table i.e. make sure P(fj/ei) for all j is equal to 1 - - void readProbTable(const char *filename); - // void readAsFertilityTable(const char *filename); -}; -/*--------------- End of Class Definition for tmodel -----------------------*/ - -#endif - -#endif diff --git a/ext/giza-pp/GIZA++-v2/Vector.h b/ext/giza-pp/GIZA++-v2/Vector.h deleted file mode 100644 index 96d26ad7..00000000 --- a/ext/giza-pp/GIZA++-v2/Vector.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -/*-- -Vector: checked vector implementation - -Franz Josef Och (30/07/99) ---*/ -#ifndef ARRAY_H_DEFINED -#define ARRAY_H_DEFINED -#include "mystl.h" -#include -#include -#include -#include -#include - - -#ifdef NDEBUG - -#include -#define Vector vector -template ostream& operator<<(ostream&o, const Vector&a) -{ - o << "Vector(" << a.size() << "){ "; - for(unsigned int iii=0;iii class Vector -{ - private: - T *p; - int realSize; - int maxWritten; - - void copy(T *a, const T *b, int n); - void copy(T *a, T *b, int n); - void _expand(); - - public: - Vector() - : p(0), realSize(0), maxWritten(-1) - { -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY: " << this<<" "<<(void*)p << '\n'; -#endif - } - Vector(const Vector &x) - : p(new T[x.maxWritten+1]), realSize(x.maxWritten+1), maxWritten(x.maxWritten) - { - memo_new(p); - copy(p, x.p, realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< '\n'; -#endif - } - explicit Vector(int n) - : p(new T[n]), realSize(n), maxWritten(n-1) - { - memo_new(p); -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - } - Vector(int n, const T&_init) - : p(new T[n]), realSize(n), maxWritten(n-1) - { - memo_new(p); - for(int iii=0;iii& operator=(const Vector&x) - { - if( this!= &x ) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - delete [] p; - memo_del(p, 1); - realSize = x.maxWritten+1; - maxWritten = x.maxWritten; - p = new T[realSize]; - memo_new(p); - copy(p, x.p, realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - } - return *this; - } - - Vector& operator=(Vector&x) - { - if( this!= &x ) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - delete [] p; - memo_del(p, 1); - realSize = x.maxWritten+1; - maxWritten = x.maxWritten; - p = new T[realSize]; - memo_new(p); - copy(p, x.p, realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - } - return *this; - } - - void allowAccess(int n) - { - while( realSize<=n ) - _expand(); - maxWritten=max(maxWritten, n); - assert( maxWritten()); - } - void init(int n, const T&_init) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - delete []p; - memo_del(p, 1); - p=new T[n]; - memo_new(p); - realSize=n; - maxWritten=n-1; - for(int iii=0;iiimaxWritten ) - errorAccess(n); -#endif - return p[n]; - } - inline const T& operator[](int n) const - { -#ifndef NDEBUG - if(n<0 || n>maxWritten ) - errorAccess(n); -#endif - return p[n]; - } - inline const T& get(int n) const - { -#ifndef NDEBUG - if(n<0 || n>maxWritten ) - errorAccess(n); -#endif - return p[n]; - } - const T&top(int n=0) const - {return (*this)[maxWritten-n];} - T&top(int n=0) - {return (*this)[maxWritten-n];} - const T&back(int n=0) const - {return (*this)[maxWritten-n];} - T&back(int n=0) - {return (*this)[maxWritten-n];} - T&push_back(const T&x) - { - allowAccess(maxWritten+1); - (*this)[maxWritten]=x; - return top(); - } - /* - bool writeTo(ostream&out) const - { - out << "Vector "; - out << size() << " "; - out << a << '\n'; - for(int iv=0;iv<=maxWritten;iv++) - { - writeOb(out, (*this)[iv]); - out << '\n'; - } - 
return 1; - } - */ - - bool readFrom(istream&in) - { - string s; - if( !in ) - { - cerr << "ERROR(Vector): file cannot be opened.\n"; - return 0; - } - in >> s; - if( !(s=="Vector") ) - { - cerr << "ERROR(Vector): Vector!='"<> biggest; - in >> a; - resize(biggest); - for(int iv=0;iv bool operator==(const Vector &x, const Vector &y) -{ - if( &x == &y ) - return 1; - else - { - if( y.size()!=x.size() ) - return 0; - else - { - for(unsigned int iii=0;iii bool operator!=(const Vector &x, const Vector &y) -{ - return !(x==y); -} - -template bool operator<(const Vector &x, const Vector &y) -{ - if( &x == &y ) - return 0; - else - { - if( y.size() void Vector:: errorAccess(int n) const -{ - cerr << "ERROR: Access to array element " << n - << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n"; - cout << "ERROR: Access to array element " << n - << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n"; - assert(0); -#ifndef DEBUG - abort(); -#endif -} - -template ostream& operator<<(ostream&o, const Vector&a) -{ - o << "Vector(" << a.size() << "){ "; - for(unsigned int iii=0;iii istream& operator>>(istream&in, Vector&) -{return in;} - -template int Hash(const Vector&a) -{ - int n=0; - for(int iii=0;iii void Vector::copy(T *aa, const T *bb, int n) -{ - for(int iii=0;iii void Vector::copy(T *aa, T *bb, int n) -{ - for(int iii=0;iii void Vector::_expand() -{ -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif - T *oldp=p; - int oldsize=realSize; - realSize=realSize*2+1; - p=new T[realSize]; - memo_new(p); - copy(p, oldp, oldsize); - delete [] oldp; - memo_del(oldp, 1); -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n'; -#endif -} - -template int Vector::findMax() const -{ - if( size()==0 ) - return -1; - else - { - int maxPos=0; - for(int iii=1;iii int Vector::findMin() const -{ - if( size()==0 ) - return -1; - else - { - int minPos=0; - for(int iii=1;iii -#include -#include - -class WordClasses -{ - private: - map Sw2c; - map Sc2int; - Vector Sint2c; - Vector w2c; - unsigned int classes; - public: - WordClasses() - : classes(1) - { - Sint2c.push_back("0"); - Sc2int["0"]=0; - } - template bool read(istream&in,const MAPPER&m) - { - string sline; - int maxword=0; - while(getline(in,sline)) - { - string word,wclass; - //istringstream iline(sline.c_str()); - istringstream iline(sline); - iline>>word>>wclass; - maxword=max(m(word),maxword); - assert(Sw2c.count(word)==0); - Sw2c[word]=wclass; - if( !Sc2int.count(wclass) ) - { - Sc2int[wclass]=classes++; - Sint2c.push_back(wclass); - assert(classes==Sint2c.size()); - } - } - w2c=Vector(maxword+1,0); - for(map::const_iterator i=Sw2c.begin();i!=Sw2c.end();++i) - w2c[m(i->first)]=Sc2int[i->second]; - cout << "Read classes: #words: " << maxword << " " << " #classes: "<< classes <=0&&int(w)second; - else - { - cerr << "WARNING: class " << x << " not found.\n"; - return 0; - } - } - string classString(unsigned int cnr)const - { - if( cnr -#include "defs.h" -#include "myassert.h" - -class al_struct -{ - public: - al_struct() - : prev(0),next(0){} - PositionIndex prev,next; -}; - - -class alignment -{ - private: - Vector a; - Vector positionSum,f; - public: - Vector als_i; - Vector als_j; - PositionIndex l,m; - alignment() - {} - alignment(PositionIndex _l, PositionIndex _m) - : a(_m+1, (PositionIndex)0), - positionSum(_l+1, (PositionIndex)0), f(_l+1, (PositionIndex)0), 
als_i(_l+1,0),als_j(_m+1),l(_l), m(_m) - { - f[0]=m; - for(PositionIndex j=1;j<=m;j++) - { - if( j>1 ) - als_j[j].prev= j-1; - if( j0); - massert(j>0); - positionSum[old_aj]-=j; - // ausfuegen - PositionIndex prev=als_j[j].prev; - PositionIndex next=als_j[j].next; - if( next ) - als_j[next].prev=prev; - if( prev ) - als_j[prev].next=next; - else - als_i[old_aj]=next; - - // neue Position suchen - PositionIndex lfd=als_i[aj],llfd=0; - while( lfd && lfd& getAlignment() const - {return a ;} - PositionIndex get_al(PositionIndex j)const - { - massert(jm ) - return 0; - for(unsigned int i=1;i<=l;i++) - if( f[i]>=MAX_FERTILITY ) - return 0; - return 1; - } - friend class transpair_model5; -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/collCounts.cpp b/ext/giza-pp/GIZA++-v2/collCounts.cpp deleted file mode 100644 index 6e6ef69d..00000000 --- a/ext/giza-pp/GIZA++-v2/collCounts.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* - -Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "alignment.h" -#include "transpair_model3.h" -#include -#include "collCounts.h" -#include "MoveSwapMatrix.h" -#include "D5Tables.h" -#include "transpair_model5.h" -#include "transpair_modelhmm.h" -#include "Parameter.h" - -extern float COUNTINCREASE_CUTOFF_AL; -// unifies collectCountsOverAlignments and findAlignmentNeighborhood FJO-20/07/99 -template -int collectCountsOverNeighborhood(const MoveSwapMatrix&msc,LogProb ascore,Array2 >&dtcount,Array2 >&ncount,LogProb&p1count,LogProb&p0count,LogProb&total_count) -{ - int nAl=0; - const PositionIndex l=msc.get_l(),m=msc.get_m(); - Array2 > cmove(l+1,m+1),cswap(l+1,m+1); - Vector negmove(m+1),negswap(m+1),plus1fert(l+1),minus1fert(l+1); - LogProb total_move,total_swap; - if( msc.isCenterDeleted()==0 ) - { - total_move+=ascore; - nAl++; - } - for(PositionIndex j=1;j<=m;j++) - for(PositionIndex i=0;i<=l;i++) - if( msc(j)!=i && !msc.isDelMove(i,j) ) - { - LogProb newscore=ascore*msc.cmove(i,j); - total_move+=newscore; - nAl++; - cmove(i,j)+=newscore; - negmove[j]+=newscore; - plus1fert[i]+=newscore; - minus1fert[msc(j)]+=newscore; - } - for(PositionIndex j1=1;j1<=m;j1++) - for(PositionIndex j2=j1+1;j2<=m;j2++) - if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) - { - LogProb newscore=ascore*msc.cswap(j1,j2); - total_swap+=newscore; - nAl++; - cswap(msc(j1),j2)+=newscore; - cswap(msc(j2),j1)+=newscore; - negswap[j1]+=newscore; - negswap[j2]+=newscore; - } - total_count+=total_move+total_swap; - for(PositionIndex j=1;j<=m;j++) - for(PositionIndex i=0;i<=l;i++) - dtcount(i,j) += ((i==msc(j)) ? 
(total_count-(negmove[j]+negswap[j])) : (cswap(i,j)+cmove(i,j))); - for(PositionIndex i=1;i<=l;i++) - { - LogProb temp=minus1fert[i]+plus1fert[i]; - if( msc.fert(i)0&&msc.fert(i)-10 ) - { - p1count += (minus1fert[0])*(LogProb)(msc.fert(0)-1); - p0count += (minus1fert[0])*(LogProb)(m-2*(msc.fert(0)-1)); - } - else - if( minus1fert[0]!=0.0 ) - cerr << "ERROR: M1Fb: " << minus1fert[0] << endl; - if(int(m)-2*(int(msc.fert(0))+1)>=0) - { - p1count += (plus1fert[0])*(LogProb)(msc.fert(0)+1); - p0count += (plus1fert[0])*(LogProb)(m-2*(msc.fert(0)+1)); - } - msc.check(); - return nAl; -}; - -template -double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix&,LogProb,void*) -{ - return 0.0; -} - -template -void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d4model*d4Table) -{ - Mmsc.check(); - const PositionIndex m=msc.get_m(),l=msc.get_l(); - for(PositionIndex j=1;j<=m;++j) - if( msc(j)!=0 ) - if( msc.get_head(msc(j))==j) - { - int ep=msc.prev_cept(msc(j)); - //massert( &d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountFirst(ep,j,msc.get_center(ep))); - d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore; - } - else - { - //massert( &d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountSecond(j,msc.prev_in_cept(j) )); - d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore; - } -} - -template -void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d5model*d5Table) -{ - Mmsc.check(); - _collectCountsOverNeighborhoodForSophisticatedModels(Mmsc,msc,ef,normalized_ascore,&d5Table->d4m); - Mmsc.check(); - const PositionIndex m=msc.get_m(),l=msc.get_l(); - PositionIndex prev_cept=0; - PositionIndex vac_all=m; - Vector vac(m+1,0); - for(PositionIndex i=1;i<=l;i++) - { - PositionIndex cur_j=msc.als_i[i]; - PositionIndex prev_j=0; - PositionIndex k=0; - if(cur_j) { // process first word of cept - k++; - d5Table->getCountRef_first(vacancies(vac,cur_j),vacancies(vac,msc.get_center(prev_cept)), - d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-msc.fert(i)+k)+=normalized_ascore; - vac_all--; - assert(vac[cur_j]==0); - vac[cur_j]=1; - Mmsc.check(); - prev_j=cur_j; - cur_j=msc.als_j[cur_j].next; - } - while(cur_j) { // process following words of cept - k++; - int vprev=vacancies(vac,prev_j); - d5Table->getCountRef_bigger(vacancies(vac,cur_j),vprev,d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-msc.fert(i)+k)+=normalized_ascore; - vac_all--; - vac[cur_j]=1; - Mmsc.check(); - prev_j=cur_j; - cur_j=msc.als_j[cur_j].next; - } - assert(k==msc.fert(i)); - if( k ) - prev_cept=i; - } - assert(vac_all==msc.fert(0)); -} - -extern int NumberOfAlignmentsInSophisticatedCountCollection; - -template -double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix&msc,LogProb normalized_ascore,MODEL*d5Table) -{ - const PositionIndex m=msc.get_m(),l=msc.get_l(); - alignment x(msc); - double sum=0; - msc.check(); - if( !msc.isCenterDeleted() ) - { - 
_collectCountsOverNeighborhoodForSophisticatedModels(msc,x,msc.get_ef(),normalized_ascore,d5Table); - NumberOfAlignmentsInSophisticatedCountCollection++; - sum+=normalized_ascore; - } - msc.check(); - for(WordIndex j=1;j<=m;j++)for(WordIndex i=0;i<=l;i++) - { - WordIndex old=x(j); - if( i!=old&& !msc.isDelMove(i,j) ) - { - msc.check(); - double c=msc.cmove(i,j)*normalized_ascore; - if(c > COUNTINCREASE_CUTOFF_AL ) - { - x.set(j,i); - _collectCountsOverNeighborhoodForSophisticatedModels(msc,x,msc.get_ef(),c,d5Table); - NumberOfAlignmentsInSophisticatedCountCollection++; - x.set(j,old); - sum+=c; - } - msc.check(); - } - } - for(PositionIndex j1=1;j1<=m;j1++) - for(PositionIndex j2=j1+1;j2<=m;j2++) - if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) - { - double c=msc.cswap(j1,j2)*normalized_ascore; - msc.check(); - if(c > COUNTINCREASE_CUTOFF_AL ) - { - int old1=msc(j1),old2=msc(j2); - x.set(j1,old2); - x.set(j2,old1); - _collectCountsOverNeighborhoodForSophisticatedModels(msc,x,msc.get_ef(),c,d5Table); - NumberOfAlignmentsInSophisticatedCountCollection++; - x.set(j1,old1); - x.set(j2,old2); - sum+=c; - } - msc.check(); - } - msc.check(); - return sum; -} - -template -int collectCountsOverNeighborhood(const Vector*,LogProb> >&smsc,Vector&es,Vector&fs,tmodel&tTable,amodel&aCountTable,amodel&dCountTable,nmodel&nCountTable,double&p1count,double&p0count,LogProb&_total,float count,bool addCounts,MODEL*d4Table) -{ - int nAl=0; - const PositionIndex l=es.size()-1,m=fs.size()-1; - Array2 > dtcount(l+1,m+1),ncount(l+1,MAX_FERTILITY+1); - LogProb p0=0,p1=0,all_total=0; - for(unsigned int i=0;iCOUNTINCREASE_CUTOFF_AL ) - { - tTable.incCount(es[i],fs[j],ijadd); - dCountTable.getRef(j,i,l,m)+=ijadd; - aCountTable.getRef(i,j,l,m)+=ijadd; - } - } - if( i>0 ) - for(PositionIndex n=0;n -#include "MoveSwapMatrix.h" -#include "D4Tables.h" -#include "transpair_model4.h" - -class OneMoveSwap -{ - public: - short type; - short a,b; - OneMoveSwap(short _type,short _a,short _b) - : type(_type),a(_a),b(_b) - {} - OneMoveSwap() - : type(0){} -}; - -inline bool operator<(const OneMoveSwap&a,const OneMoveSwap&b) -{ - if(a.type&s) -{ - for(set::const_iterator i=s.begin();i!=s.end();++i) - cout << *i << ' '; - return out; -} - -bool makeOneMoveSwap(const alignment&a,const alignment&b,set&oms); - -template -int collectCountsOverNeighborhood(const Vector*,LogProb> >&smsc, - Vector&es, - Vector&fs,tmodel&tTable, - amodel&aCountTable,amodel&dCountTable, - nmodel&nCountTable,double&p1count,double&p0count, - LogProb&_total,float count,bool addCounts,MODEL*d4Table=0); - -#endif diff --git a/ext/giza-pp/GIZA++-v2/defs.h b/ext/giza-pp/GIZA++-v2/defs.h deleted file mode 100644 index e94adddd..00000000 --- a/ext/giza-pp/GIZA++-v2/defs.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _defs_h -#define _defs_h 1 -#include -#include -#include - -const int TRANSFER_SIMPLE=1; -const int TRANSFER=0; - -const unsigned int MAX_SENTENCE_LENGTH_ALLOWED=101; -const int TRAIN_BUFFER_SIZE= 50000; -//#ifdef WORDINDEX_WITH_4_BYTE -typedef unsigned int WordIndex; -const unsigned int MAX_VOCAB_SIZE=UINT_MAX; -typedef unsigned int PositionIndex; -//#else -//typedef unsigned short WordIndex; -//const unsigned int MAX_VOCAB_SIZE=USHRT_MAX; -//typedef unsigned short PositionIndex; -//#endif -extern WordIndex MAX_FERTILITY; - -const int MAX_W=457979; -extern double LAMBDA; // Lambda that is used to scale cross_entropy factor - -typedef float PROB ; -typedef float COUNT ; - -class LogProb { - private: - double x ; - public: - LogProb():x(0){} - LogProb(double y):x(y){} - LogProb(float y):x(y){} - LogProb(int y):x(y){} - LogProb(WordIndex y):x(y){} - operator double() const {return x;} - LogProb operator *= (double y) { x *= y ; return *this;} - LogProb operator *= (LogProb y) { x *= y.x ; return *this;} - LogProb operator /= (double y) { x /= y ; return *this;} - LogProb operator /= (LogProb y) { x /= y.x ; return *this;} - LogProb operator += (double y) { x += y ; return *this;} - LogProb operator += (LogProb y) { x += y.x ; return *this;} -}; - -const int PARLEV_ITER=1; -const int PARLEV_OPTHEUR=2; -const int PARLEV_OUTPUT=3; -const int PARLEV_SMOOTH=4; -const int PARLEV_EM=5; -const int PARLEV_MODELS=6; -const int PARLEV_SPECIAL=7; -const int PARLEV_INPUT=8; - -#endif - diff --git a/ext/giza-pp/GIZA++-v2/dependencies b/ext/giza-pp/GIZA++-v2/dependencies deleted file mode 100644 index 682ff2d7..00000000 --- a/ext/giza-pp/GIZA++-v2/dependencies +++ /dev/null @@ -1,635 +0,0 @@ -#Automatically generated dependecy list -optimized/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h -optimized/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \ - ATables.h Array4.h TTables.h Globals.h alignment.h -optimized/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h -optimized/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \ - transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h -optimized/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h -optimized/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \ - Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \ - FlexArray.h -optimized/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h -optimized/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \ - TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \ - model2.h model1.h Perplexity.h Dictionary.h 
WordClasses.h HMMTables.h \ - FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \ - HMMTables.cpp -optimized/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \ - Pointer.h -optimized/logprob.o: logprob.cpp logprob.h -optimized/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \ - Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \ - alignment.h transpair_model3.h NTables.h transpair_model2.h \ - transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \ - WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \ - file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \ - transpair_model4.h transpair_model5.h -optimized/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \ - Dictionary.h utility.h Parameter.h Pointer.h -optimized/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \ - getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \ - Pointer.h -optimized/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h utility.h -optimized/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h collCounts.h transpair_model4.h -optimized/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \ - transpair_model5.h Parameter.h Pointer.h -optimized/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h utility.h -optimized/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \ - myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \ - alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - getSentence.h model2.h model1.h Perplexity.h 
Dictionary.h \ - transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \ - HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \ - transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \ - Pointer.h collCounts.cpp -optimized/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \ - WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h -optimized/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h -optimized/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \ - Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h -optimized/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \ - Array2.h Pointer.h Globals.h defs.h Vector.h -optimized/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \ - D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \ - Pointer.h -optimized/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h Globals.h -optimized/plain2snt.o: plain2snt.cpp -optimized/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \ - Pointer.h -optimized/snt2cooc.o: snt2cooc.cpp -optimized/snt2plain.o: snt2plain.cpp -optimized/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h transpair_model2.h \ - transpair_model1.h -optimized/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \ - transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \ - Pointer.h -optimized/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \ - WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \ - transpair_model1.h Parameter.h Pointer.h -optimized/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h -optimized/utility.o: utility.cpp mymath.h -optimized/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h -#Automatically generated dependecy list -debug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h -debug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \ - ATables.h Array4.h TTables.h Globals.h alignment.h -debug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h -debug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h 
Globals.h transpair_model2.h transpair_model1.h \ - collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \ - transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h -debug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h -debug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \ - Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \ - FlexArray.h -debug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h -debug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \ - TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \ - model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \ - FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \ - HMMTables.cpp -debug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \ - Pointer.h -debug/logprob.o: logprob.cpp logprob.h -debug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \ - Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \ - alignment.h transpair_model3.h NTables.h transpair_model2.h \ - transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \ - WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \ - file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \ - transpair_model4.h transpair_model5.h -debug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \ - Dictionary.h utility.h Parameter.h Pointer.h -debug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \ - getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \ - Pointer.h -debug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h utility.h -debug/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h collCounts.h transpair_model4.h -debug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - 
AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \ - transpair_model5.h Parameter.h Pointer.h -debug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h utility.h -debug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \ - myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \ - alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - getSentence.h model2.h model1.h Perplexity.h Dictionary.h \ - transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \ - HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \ - transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \ - Pointer.h collCounts.cpp -debug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \ - WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h -debug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h -debug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \ - Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h -debug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \ - Array2.h Pointer.h Globals.h defs.h Vector.h -debug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \ - D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \ - Pointer.h -debug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h Globals.h -debug/plain2snt.o: plain2snt.cpp -debug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \ - Pointer.h -debug/snt2cooc.o: snt2cooc.cpp -debug/snt2plain.o: snt2plain.cpp -debug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h transpair_model2.h \ - transpair_model1.h -debug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \ - transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \ - Pointer.h -debug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \ - WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \ - transpair_model1.h Parameter.h Pointer.h -debug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \ - myassert.h 
mymath.h Array2.h Globals.h Parameter.h Pointer.h -debug/utility.o: utility.cpp mymath.h -debug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h -#Automatically generated dependecy list -vdebug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h -vdebug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \ - ATables.h Array4.h TTables.h Globals.h alignment.h -vdebug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h -vdebug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \ - transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h -vdebug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h -vdebug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \ - Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \ - FlexArray.h -vdebug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h -vdebug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \ - TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \ - model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \ - FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \ - HMMTables.cpp -vdebug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \ - Pointer.h -vdebug/logprob.o: logprob.cpp logprob.h -vdebug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \ - Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \ - alignment.h transpair_model3.h NTables.h transpair_model2.h \ - transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \ - WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \ - file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \ - transpair_model4.h transpair_model5.h -vdebug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \ - Dictionary.h utility.h Parameter.h Pointer.h -vdebug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \ - getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \ - Pointer.h -vdebug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h utility.h -vdebug/model345-peg.o: model345-peg.cpp model3.h 
Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h collCounts.h transpair_model4.h -vdebug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \ - transpair_model5.h Parameter.h Pointer.h -vdebug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h utility.h -vdebug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \ - myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \ - alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - getSentence.h model2.h model1.h Perplexity.h Dictionary.h \ - transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \ - HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \ - transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \ - Pointer.h collCounts.cpp -vdebug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \ - WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h -vdebug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h -vdebug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \ - Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h -vdebug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \ - Array2.h Pointer.h Globals.h defs.h Vector.h -vdebug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \ - D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \ - Pointer.h -vdebug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h Globals.h -vdebug/plain2snt.o: plain2snt.cpp -vdebug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \ - Pointer.h -vdebug/snt2cooc.o: snt2cooc.cpp -vdebug/snt2plain.o: snt2plain.cpp -vdebug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \ - mystl.h myassert.h mymath.h defs.h 
Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h transpair_model2.h \ - transpair_model1.h -vdebug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \ - transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \ - Pointer.h -vdebug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \ - WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \ - transpair_model1.h Parameter.h Pointer.h -vdebug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h -vdebug/utility.o: utility.cpp mymath.h -vdebug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h -#Automatically generated dependecy list -norm/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h -norm/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \ - ATables.h Array4.h TTables.h Globals.h alignment.h -norm/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h -norm/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \ - transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h -norm/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h -norm/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \ - Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \ - FlexArray.h -norm/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h -norm/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \ - TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \ - model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \ - FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \ - HMMTables.cpp -norm/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \ - Pointer.h -norm/logprob.o: logprob.cpp logprob.h -norm/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \ - Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \ - alignment.h transpair_model3.h NTables.h transpair_model2.h \ - transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \ - WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \ - file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \ - transpair_model4.h transpair_model5.h -norm/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h 
vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \ - Dictionary.h utility.h Parameter.h Pointer.h -norm/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \ - getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \ - Pointer.h -norm/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h utility.h -norm/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h collCounts.h transpair_model4.h -norm/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \ - transpair_model5.h Parameter.h Pointer.h -norm/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h utility.h -norm/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \ - myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \ - alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - getSentence.h model2.h model1.h Perplexity.h Dictionary.h \ - transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \ - HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \ - transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \ - Pointer.h collCounts.cpp -norm/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \ - WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h -norm/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h -norm/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \ - Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h -norm/Parameter.o: Parameter.cpp Parameter.h 
mystl.h myassert.h mymath.h \ - Array2.h Pointer.h Globals.h defs.h Vector.h -norm/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \ - D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \ - Pointer.h -norm/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h Globals.h -norm/plain2snt.o: plain2snt.cpp -norm/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \ - Pointer.h -norm/snt2cooc.o: snt2cooc.cpp -norm/snt2plain.o: snt2plain.cpp -norm/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h transpair_model2.h \ - transpair_model1.h -norm/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \ - transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \ - Pointer.h -norm/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \ - WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \ - transpair_model1.h Parameter.h Pointer.h -norm/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h -norm/utility.o: utility.cpp mymath.h -norm/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h -#Automatically generated dependecy list -profile/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h -profile/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \ - ATables.h Array4.h TTables.h Globals.h alignment.h -profile/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h -profile/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \ - transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h -profile/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h -profile/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \ - Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \ - FlexArray.h -profile/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h -profile/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \ - TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \ - model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \ - FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \ - 
HMMTables.cpp -profile/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \ - Pointer.h -profile/logprob.o: logprob.cpp logprob.h -profile/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \ - Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \ - alignment.h transpair_model3.h NTables.h transpair_model2.h \ - transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \ - WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \ - file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \ - transpair_model4.h transpair_model5.h -profile/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \ - Dictionary.h utility.h Parameter.h Pointer.h -profile/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \ - getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \ - Pointer.h -profile/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h utility.h -profile/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h collCounts.h transpair_model4.h -profile/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \ - Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \ - AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \ - transpair_model5.h Parameter.h Pointer.h -profile/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \ - transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \ - Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \ - model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \ - D4Tables.h AlignTables.h utility.h -profile/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \ - myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \ - alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \ - getSentence.h model2.h model1.h Perplexity.h Dictionary.h \ - transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \ - HMMTables.h FlexArray.h D4Tables.h 
AlignTables.h collCounts.h \ - transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \ - Pointer.h collCounts.cpp -profile/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \ - mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \ - NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \ - transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \ - WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \ - ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \ - Perplexity.h Dictionary.h HMMTables.h FlexArray.h -profile/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h -profile/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \ - Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h -profile/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \ - Array2.h Pointer.h Globals.h defs.h Vector.h -profile/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \ - D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \ - Pointer.h -profile/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \ - mymath.h Array2.h defs.h Globals.h -profile/plain2snt.o: plain2snt.cpp -profile/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \ - Pointer.h -profile/snt2cooc.o: snt2cooc.cpp -profile/snt2plain.o: snt2plain.cpp -profile/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h transpair_model2.h \ - transpair_model1.h -profile/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \ - transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \ - Pointer.h -profile/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \ - mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \ - Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \ - WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \ - transpair_model1.h Parameter.h Pointer.h -profile/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \ - myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h -profile/utility.o: utility.cpp mymath.h -profile/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \ - Array2.h diff --git a/ext/giza-pp/GIZA++-v2/file_spec.h b/ext/giza-pp/GIZA++-v2/file_spec.h deleted file mode 100644 index 2d432958..00000000 --- a/ext/giza-pp/GIZA++-v2/file_spec.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef FILE_SPEC_H -#define FILE_SPEC_H - -#include -#include -#include -#include - -/* This function returns a string, locally called file_spec. This - string is the concatenation of the date and time of execution - and the user who is performing the execution */ -/* Originally implemented in C by Yaser Al-Onaizan; - editions for C++ and formatting by Noah A. Smith, 9 July 1999 */ - -char *Get_File_Spec (){ - struct tm *local; - time_t t; - char *user; - char time_stmp[19]; - char *file_spec = 0; - - t = time(NULL); - local = localtime(&t); - - sprintf(time_stmp, "%02d-%02d-%02d.%02d%02d%02d.", local->tm_year, - (local->tm_mon + 1), local->tm_mday, local->tm_hour, - local->tm_min, local->tm_sec); - user = getenv("USER"); - - file_spec = (char *)malloc(sizeof(char) * - (strlen(time_stmp) + strlen(user) + 1)); - file_spec[0] = '\0'; - strcat(file_spec, time_stmp) ; - strcat(file_spec, user); - return file_spec; -} - -#endif diff --git a/ext/giza-pp/GIZA++-v2/getSentence.cpp b/ext/giza-pp/GIZA++-v2/getSentence.cpp deleted file mode 100644 index 125a6f7c..00000000 --- a/ext/giza-pp/GIZA++-v2/getSentence.cpp +++ /dev/null @@ -1,340 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/* --------------------------------------------------------------------------* - * * - * Module : getSentece * - * * - * Method Definitions File: getSentence.cc * - * * - * Objective: Defines clases and methods for handling I/O for the parallel * - * corpus. * - *****************************************************************************/ - - -#include "getSentence.h" -#include -#include -#include "Parameter.h" -#include "errno.h" - -int PrintedTooLong=0; - -/* -------------- Method Defnitions for Class sentenceHandler ---------------*/ - -GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0); -GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0); -GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0); - -sentenceHandler::sentenceHandler(const char* filename, vcbList* elist, - vcbList* flist) : realCount(0) - // This method is the constructor of the class, it also intitializes the - // sentence pair sequential number (count) to zero. 
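// [Editorial sketch, not part of the original diff] The .snt corpus layout this
// constructor scans can be illustrated from the readNextSentence() comment
// further below; the token ids here are hypothetical. Each pair is a line triple:
//
//   2            <- times this pair occurred in the training corpus
//   12 7 43      <- source sentence: space-separated integer token ids
//   9 31 5 17    <- target sentence: space-separated integer token ids
//
// A negative count marks a manual-dictionary entry (see the noOcc < 0 handling
// in this constructor and in getNextSentence()); internally a null word with
// id 0 is prepended to every source sentence.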
- -{ - readflag = false ; - allInMemory = false ; - inputFilename = filename ; - inputFile = new ifstream(filename); - pair_no = 0 ; - if(!(*inputFile)){ - cerr << "\nERROR:(a) Cannot open " << filename; - exit(1); - } - currentSentence = 0; - totalPairs1 = 0 ; - totalPairs2 =0; - pair_no = 0 ; - noSentInBuffer = 0 ; - Buffer.clear(); - bool isNegative=0; - if (elist && flist){ - cout << "Calculating vocabulary frequencies from corpus " << filename << '\n'; - sentPair s ; - while (getNextSentence(s, elist, flist)) - { - totalPairs1++; - totalPairs2+=s.realCount; - // NOTE: this value might change during training - // for words from the manual dictionary, yet this is ignored! - - if( s.noOcc<0 ) - isNegative=1; - } - } - if( isNegative==1 ) - { - cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n"; - realCount=new Vector(totalPairs1,1.0); - } - else - realCount=0; -} - -void sentenceHandler::rewind() -{ - currentSentence = 0; - readflag = false ; - if (!allInMemory || - !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){ - // check if the buffer doe not already has the first chunk of pairs - if (Buffer.size() > 0) - cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n'; - // totalPairs = 0 ; - pair_no = 0 ; - noSentInBuffer = 0 ; - Buffer.clear(); - } - if (!allInMemory){ - delete inputFile; - inputFile = new ifstream(inputFilename); - if(!(*inputFile)){ - cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno; - } - } -} - - -bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist) -{ - sentPair s ; - if (readflag){ - cerr << "Attempting to read from the end of corpus, rewinding\n"; - rewind(); - return(false); - } - if (currentSentence >= noSentInBuffer){ - if (allInMemory) - return(false); - /* no more sentences in buffer */ - noSentInBuffer = 0 ; - currentSentence = 0 ; - Buffer.clear(); - cout << "Reading more sentence pairs into memory ... 
\n"; - while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){ - if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){ - cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<< - "the maximum allowed limit for a source word fertility\n"<< - " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 << - " ratio " << double(s.fSent.size()-1)/ (s.eSent.size()-1) << " ferility limit : " << - MAX_FERTILITY-1 << '\n'; - cerr << "Shortening sentence \n"; - cerr << s; - s.eSent.resize(min(s.eSent.size(),s.fSent.size())); - s.fSent.resize(min(s.eSent.size(),s.fSent.size())); - } - Buffer.push_back(s) ; - if (elist && flist){ - if ((*elist).size() > 0) - for (WordIndex i= 0 ; i < s.eSent.size() ; i++){ - if (s.eSent[i] >= (*elist).uniqTokens()){ - if( PrintedTooLong++<100) - cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n"; - exit(-1); - } - (*elist).incFreq(s.eSent[i], s.realCount); - } - if ((*flist).size() > 0) - for (WordIndex j= 1 ; j < s.fSent.size() ; j++){ - if (s.fSent[j] >= (*flist).uniqTokens()){ - cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n"; - exit(-1); - } - (*flist).incFreq(s.fSent[j], s.realCount); - } - } - noSentInBuffer++; - } - if (inputFile->eof()){ - allInMemory = (Buffer.size() >= 1 && - Buffer[currentSentence].sentenceNo == 1) ; - if (allInMemory) - cout << "Corpus fits in memory, corpus has: " << Buffer.size() << - " sentence pairs.\n"; - } - } - if(noSentInBuffer <= 0 ){ - //cerr << "# sent in buffer " << noSentInBuffer << '\n'; - readflag = true ; - return(false); - } - sent = Buffer[currentSentence++] ; - if( sent.noOcc<0 && realCount ) - { - if( Manlexfactor1 && sent.noOcc==-1.0 ) - sent.realCount=Manlexfactor1; - else if( Manlexfactor2 && sent.noOcc==-2.0 ) - sent.realCount=Manlexfactor2; - else - sent.realCount=(*realCount)[sent.getSentenceNo()-1]; - } - return true ; -} -bool sentenceHandler::readNextSentence(sentPair& sent) - /* This method reads in a new pair of sentences, each pair is read from the - corpus file as line triples. The first line the no of times this line - pair occured in the corpus, the second line is the source sentence and - the third is the target sentence. The sentences are represented by a space - separated positive integer token ids. */ -{ - - string line; - bool fail(false) ; - - sent.clear(); - if (getline(*inputFile, line)){ - istringstream buffer(line); - buffer >> sent.noOcc; - if( sent.noOcc<0 ) - { - if( realCount ) - { - if( Manlexfactor1 && sent.noOcc==-1.0 ) - sent.realCount=Manlexfactor1; - else if( Manlexfactor2 && sent.noOcc==-2.0 ) - sent.realCount=Manlexfactor2; - else - { - sent.realCount=(*realCount)[pair_no]; - } - } - else - sent.realCount=1.0; - } - else - sent.realCount=sent.noOcc; - } - else { - fail = true ;; - } - if (getline(*inputFile, line)){ - istringstream buffer(line); - WordIndex w; // w is a local variabe for token id - sent.eSent.push_back(0); // each source word is assumed to have 0 == - // a null word (id 0) at the begining of the sentence. - while(buffer>>w){ // read source sentece , word by word . - if (sent.eSent.size() < MAX_SENTENCE_LENGTH) - sent.eSent.push_back(w); - else { - if( PrintedTooLong++<100) - cerr << "{WARNING:(a)truncated sentence "<>w){ // read target sentece , word by word . 
- if (sent.fSent.size() < MAX_SENTENCE_LENGTH) - sent.fSent.push_back(w); - else { - if( PrintedTooLong++<100) - cerr << "{WARNING:(b)truncated sentence "<&vd) -{ - Vector l; - for(double lambda=1.0;lambda0&&(oldPairs.back().get_eSent()!=s.get_eSent()||oldPairs.back().getSentenceNo()>=s.getSentenceNo()) ) - { - double lambda=optimize_lambda(oldProbs); - for(unsigned int i=0;i -#include -#include -#include "Vector.h" -#include "defs.h" -#include "vocab.h" -#include "Globals.h" -/*----------------------- Class Prototype Definition ------------------------* - Class Name: sentenceHandleer - Objective: This class is defined to handle training sentece pairs from the - parallel corpus. Each pair has: a target sentece, called here French; a - source sentece, called here English sentece; and an integer number denoting - the number of times this pair occured in trining corpus. Both source and - target senteces are represented as integer vector (variable size arrays), - each entry is a numeric value which is the token id for the particular token - in the sentece. - - *---------------------------------------------------------------------------*/ - -class sentPair{ - public: - int sentenceNo ; - float noOcc; - float realCount; - Vector eSent ; - Vector fSent; - - public: - sentPair(){}; - void clear(){ eSent.clear(); fSent.clear(); noOcc=0; realCount=0; sentenceNo=0;}; - const Vector&get_eSent()const - { return eSent; } - const Vector&get_fSent()const - { return fSent; } - int getSentenceNo()const - { return sentenceNo; } - double getCount()const - { return realCount; } -}; - -inline ostream&operator<<(ostream&of,const sentPair&s) -{ - of << "Sent No: " << s.sentenceNo << " , No. Occurrences: " << s.noOcc << '\n'; - if( s.noOcc!=s.realCount ) - of << " Used No. Occurrences: " << s.realCount << '\n'; - unsigned int i; - for(i=0; i < s.eSent.size(); i++) - of << s.eSent[i] << ' '; - of << '\n'; - for(i=1; i < s.fSent.size(); i++) - of << s.fSent[i] << ' '; - of << '\n'; - return of; -} - -class sentenceHandler{ -public: - const char * inputFilename; // parallel corpus file name, similar for all - // sentence pair objects - ifstream *inputFile; // parallel corpus file handler - Vector Buffer; - int noSentInBuffer ; - int currentSentence ; - int totalPairs1 ; - double totalPairs2; - bool readflag ; // true if you reach the end of file - bool allInMemory ; - int pair_no ; - Vector *realCount; - - Vector oldPairs; - Vector oldProbs; - sentenceHandler(const char* filename, vcbList* elist=0, vcbList* flist=0); - void rewind(); - bool getNextSentence(sentPair&, vcbList* = 0, vcbList* = 0); // will be defined in the definition file, this - int getTotalNoPairs1()const {return totalPairs1;}; - double getTotalNoPairs2()const {return totalPairs2;}; - // method will read the next pair of sentence from memory buffer - bool readNextSentence(sentPair&); // will be defined in the definition file, this - void setProbOfSentence(const sentPair&s,double d); -}; - -#endif - diff --git a/ext/giza-pp/GIZA++-v2/hmm.cpp b/ext/giza-pp/GIZA++-v2/hmm.cpp deleted file mode 100644 index fc4284c6..00000000 --- a/ext/giza-pp/GIZA++-v2/hmm.cpp +++ /dev/null @@ -1,405 +0,0 @@ -/* - -Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "hmm.h" -#include "Globals.h" -#include "utility.h" -#include "HMMTables.h" -#include "ForwardBackward.h" -#include "Parameter.h" - -#define CLASSIFY(i,empty,ianf) bool empty=(i>=l); unsigned int ianf=(i%l); -#define CLASSIFY2(i,ianf) unsigned int ianf=(i%l); - - -short PredictionInAlignments=0; -short UniformEntryExit=3; -short HMMTrainingSpecialFlags=0; - -GLOBAL_PARAMETER2(int,ModelH_Dump_Freq,"HMM DUMP FREQUENCY","th","dump frequency of HMM",PARLEV_OUTPUT,0); - -GLOBAL_PARAMETER(short,CompareAlDeps,"emAlignmentDependencies", - "lextrain: dependencies in the HMM alignment model. " - " &1: sentence length; &2: previous class; &4: previous position; " - " &8: French position; &16: French class" - ,PARLEV_MODELS,2); -GLOBAL_PARAMETER(double,GLOBALProbabilityForEmpty,"emProbForEmpty", - "f-b-trn: probability for empty word",PARLEV_MODELS,0.4); -GLOBAL_PARAMETER(short,SmoothHMM,"emSmoothHMM", - "f-b-trn: smooth HMM model &1: modified counts; &2:perform smoothing with -emAlSmooth",PARLEV_SPECIAL,2); -GLOBAL_PARAMETER(double,HMMAlignmentModelSmoothFactor,"emAlSmooth", - "f-b-trn: smoothing factor for HMM alignment model (can be ignored by -emSmoothHMM)",PARLEV_SMOOTH,0.2); - - -/*template -void smooth_standard(T*a,T*b,double p) -{ - int n=b-a; - if( n==0 ) - return; - double pp=p/n; - for(T*i=a;i!=b;++i) - *i = (1.0-p)*(*i)+pp; -}*/ - - -hmm::hmm(model2& m) - : model2(m),counts(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses), - probs(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses) -{ } - -void hmm::initialize_table_uniformly(sentenceHandler&){} - -int hmm::em_with_tricks(int noIterations) -{ - double minErrors=1.0;int minIter=0; - string modelName="Hmm",shortModelName="hmm"; - int dumpFreq=ModelH_Dump_Freq; - time_t it_st, st, it_fn, fn; - string tfile, afile,afileh, number, alignfile, test_alignfile; - int pair_no = 0; - bool dump_files = false ; - ofstream of2 ; - st = time(NULL) ; - sHandler1.rewind(); - cout << "\n==========================================================\n"; - cout << modelName << " Training Started at: " << ctime(&st); - for(int it=1; it <= noIterations ; it++){ - pair_no = 0; - it_st = time(NULL) ; - cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n'; - dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS; - number = ""; - int n = it; - do{ - number.insert((size_t)0, 1, (char)(n % 10 + '0')); - } while((n /= 10) > 0); - tfile = Prefix + ".t" + shortModelName + "." + number ; - afile = Prefix + ".a" + shortModelName + "." + number ; - afileh = Prefix + ".h" + shortModelName + "." + number ; - alignfile = Prefix + ".A" + shortModelName + "." + number ; - test_alignfile = Prefix + ".tst.A" + shortModelName + "." 
+ number ; - counts=HMMTables(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses); - aCountTable.clear(); - initAL(); - em_loop(perp, sHandler1, dump_files , alignfile.c_str(), trainViterbiPerp, false,it==1,it); - if( errorsAL()cross_entropy() - << " PERPLEXITY " << testViterbiPerp->perplexity() - << '\n'; - if (dump_files){ - if( OutputInAachenFormat==0) - tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat); - ofstream afilestream(afileh.c_str()); - probs.writeJumps(afilestream); - aCountTable.printTable(afile.c_str()); - } - it_fn = time(NULL) ; - cout << "\n" << modelName << " Iteration: " << it<< " took: " << - difftime(it_fn, it_st) << " seconds\n"; - } // end of iterations - fn = time(NULL) ; - cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n"; - //cout << "tTable contains " << tTable.getHash().bucket_count() - // << " buckets and " << tTable.getHash().size() << " entries." ; - cout << "==========================================================\n"; - return minIter; -} - -/*template -T normalize_if_possible_with_increment(T*a,T*b,int increment) -{ - T sum=0; - for(T*i=a;i!=b;i+=increment) - sum+=*i; - if( sum ) - for(T*i=a;i!=b;i+=increment) - *i/=sum; - else - { - T factor=increment/(b-a); - for(T*i=a;i!=b;i+=increment) - *i=factor; - } - return sum; -}*/ - -void hmm::load_table(const char* aname){ - cout << "Hmm: loading a table not implemented.\n"; - abort(); - ifstream anamefile(aname); - probs.readJumps(anamefile); -} - -HMMNetwork *hmm::makeHMMNetwork(const Vector& es,const Vector&fs,bool doInit)const -{ - unsigned int i,j; - unsigned int l = es.size() - 1; - unsigned int m = fs.size() - 1; - unsigned int I=2*l,J=m; - int IJ=I*J; - bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2); - bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0); - HMMNetwork *net = new HMMNetwork(I,J); - fill(net->alphainit.begin(),net->alphainit.end(),0.0); - fill(net->betainit.begin(),net->betainit.end(),0.0); - for(j=1;j<=m;j++) - { - for(i=1;i<=l;i++) - net->n(i-1,j-1)=tTable.getProb(es[i], fs[j]) ; - double emptyContribution=0; - emptyContribution=tTable.getProb(es[0],fs[j]) ; - for(i=1;i<=l;i++) - net->n(i+l-1,j-1)=emptyContribution; - net->finalMultiply*=max(normalize_if_possible_with_increment(&net->n(0,j-1),&net->n(0,j-1)+IJ,J),double(1e-12)); - } - if( DependencyOfJ ) - net->e.resize(m-1); - else - net->e.resize(J>1); - for(j=0;je.size();j++) - { - int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(j)+1)]); - net->e[j].resize(I,I,0); - for(unsigned int i1=0;i1 al(l); - CLASSIFY2(i1,i1real); - for(unsigned int i2=0;i2(al.begin()),conv(al.end())); - if( SmoothHMM&2 ) - smooth_standard(conv(al.begin()),conv(al.end()),HMMAlignmentModelSmoothFactor); - for(unsigned int i2=0;i2e[j](i1,i2) = al[i2real]; - - if( empty_i2 ) - if(i1real!=i2real) - { - net->e[j](i1,i2)=0; - } - else - { - net->e[j](i1,i2)=doInit?al[0]:(probs.getProbabilityForEmpty()); // make first HMM iteration like IBM-1 - } - } - normalize_if_possible(&net->e[j](i1,0),&net->e[j](i1,0)+I); - } - } - if( doInit ) - { - for(unsigned int i=0;ialphainit[i]=net->betainit[i]=(ibetainit[i]=1.0; - } - } - else - { - if( DependencyOfPrevAJ==0 ) - { - for(i=0;ialphainit[i]=probs.getAlProb(-1,ireal,l,m,0,fwordclasses.getClass(fs[1+0]),0); - } - } - else - { - if( UniformEntryExit&2 )probs.getBetaInit(I,net->betainit); - if( UniformEntryExit&1 )probs.getAlphaInit(I,net->alphainit); - } - } - massert( 
net->alphainit.size()==I );massert( net->betainit.size()==I ); - normalize_if_possible(conv(net->alphainit.begin()),conv(net->alphainit.end())); - normalize_if_possible(conv(net->betainit.begin()),conv(net->betainit.end())); - transform(net->betainit.begin(),net->betainit.end(),net->betainit.begin(),bind1st(multiplies(),2*l)); - return net; -} -extern float MINCOUNTINCREASE; - -void hmm::em_loop(Perplexity& perp, sentenceHandler& sHandler1, - bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, - bool test,bool doInit,int -) -{ - WordIndex i, j, l, m ; - double cross_entropy; - int pair_no=0 ; - perp.clear(); - viterbi_perp.clear(); - ofstream of2; - // for each sentence pair in the corpus - if (dump_alignment||FEWDUMPS ) - of2.open(alignfile); - sentPair sent ; - sHandler1.rewind(); - while(sHandler1.getNextSentence(sent)){ - const Vector& es = sent.get_eSent(); - const Vector& fs = sent.get_fSent(); - const float so = sent.getCount(); - l = es.size() - 1; - m = fs.size() - 1; - cross_entropy = log(1.0); - Vector viterbi_alignment(fs.size()); - - unsigned int I=2*l,J=m; - bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2); - bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0); - HMMNetwork *net=makeHMMNetwork(es,fs,doInit); - Array gamma; - Array > epsilon(DependencyOfJ?(m-1):1); - double trainProb; - trainProb=ForwardBackwardTraining(*net,gamma,epsilon); - if( !test ) - { - double *gp=conv(gamma.begin()); - for(unsigned int i2=0;i2MINCOUNTINCREASE ) - { - COUNT add= *gp*so; - if( i1>=l ) - { - tTable.incCount(es[0],fs[1+i2],add); - aCountTable.getRef(0,i2+1,l,m)+=add; - } - else - { - tTable.incCount(es[1+i1],fs[1+i2],add); - aCountTable.getRef(1+i1,1+i2,l,m)+=add; - } - } - double p0c=0.0,np0c=0.0; - for(unsigned int jj=0;jj(gamma.begin()),*gp2=conv(gamma.end())-I; - Array&ai=counts.doGetAlphaInit(I); - Array&bi=counts.doGetBetaInit(I); - int firstFrenchClass=(fs.size()>1)?(fwordclasses.getClass(fs[1+0])):0; - for(i=0;ifinalMultiply,1e-100)); - Arrayvit; - double viterbi_score=1.0; - if( (HMMTrainingSpecialFlags&1) ) - HMMViterbi(*net,gamma,vit); - else - viterbi_score=HMMRealViterbi(*net,vit); - for(j=1;j<=m;j++) - { - viterbi_alignment[j]=vit[j-1]+1; - if( viterbi_alignment[j]>l) - viterbi_alignment[j]=0; - } - sHandler1.setProbOfSentence(sent,cross_entropy); - perp.addFactor(cross_entropy, so, l, m,1); - viterbi_perp.addFactor(log(viterbi_score)+log(max(net->finalMultiply,1e-100)), so, l, m,1); - if( Verbose ) - cout << "Viterbi-perp: " << log(viterbi_score) << ' ' << log(max(net->finalMultiply,1e-100)) << ' ' << viterbi_score << ' ' << net->finalMultiply << ' ' << *net << "gamma: " << gamma << endl; - delete net;net=0; - if (dump_alignment||(FEWDUMPS&&sent.getSentenceNo()<1000) ) - printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.getSentenceNo(), viterbi_score); - addAL(viterbi_alignment,sent.getSentenceNo(),l); - pair_no++; - } /* of while */ - sHandler1.rewind(); - perp.record("HMM"); - viterbi_perp.record("HMM"); - errorReportAL(cout,"HMM"); -} - -#include "HMMTables.cpp" -template class HMMTables; - diff --git a/ext/giza-pp/GIZA++-v2/hmm.h b/ext/giza-pp/GIZA++-v2/hmm.h deleted file mode 100644 index 6909bead..00000000 --- a/ext/giza-pp/GIZA++-v2/hmm.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and 
David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _hmm_h -#define _hmm_h 1 - -#include - -#include -#include -#include -#include -#include -#include "Vector.h" -#include - -#include -#include -#include - -#include "TTables.h" -#include "ATables.h" -#include "getSentence.h" -#include "defs.h" -#include "model2.h" -#include "Perplexity.h" -#include "vocab.h" -#include "WordClasses.h" -#include "HMMTables.h" -#include "ForwardBackward.h" - -class hmm : public model2 -{ - private: - WordClasses ewordclasses; - WordClasses fwordclasses; - HMMTables counts,probs; - public: - template - void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile) - { - ifstream estrm(efile.c_str()),fstrm(ffile.c_str()); - if( !estrm ) - { - cerr << "ERROR: can not read " << efile << endl; - } - else - ewordclasses.read(estrm,m1); - if( !fstrm ) - cerr << "ERROR: can not read " << ffile << endl; - else - fwordclasses.read(fstrm,m2); - } - hmm(model2&m2); - void initialize_table_uniformly(sentenceHandler&); - int em_with_tricks(int); - void load_table(const char* aname); - void em_loop(Perplexity& perp, sentenceHandler& sHandler1, bool dump_files, - const char* alignfile, Perplexity&, bool test,bool doInit,int iter); - HMMNetwork *makeHMMNetwork(const Vector& es,const Vector&fs,bool doInit)const; - friend class model3; -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/logprob.cpp b/ext/giza-pp/GIZA++-v2/logprob.cpp deleted file mode 100644 index 9035f80f..00000000 --- a/ext/giza-pp/GIZA++-v2/logprob.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - -// Routines to perform integer exponential arithmetic. -// A number x is represented as n, where x = b**n. -// It is assumed that b > 1, something like b = 1.001; - -#include "logprob.h" -#include -#include -#include -#include -#include -double *LogProb::ntof = NULL; // Tables will be initialized -int *LogProb::addtbl = NULL; // in Initialize function. 
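// [Editorial worked example for the representation noted above, x = b**n with
// b = 1.001; figures are approximate]
//   x = 0.5 is stored as n = round(ln(0.5)/ln(1.001)) = round(-693.49) = -693
//   x = 2.0 is stored as n = round(ln(2.0)/ln(1.001)) = round(+693.49) = +693
// Multiplication is thus integer addition of exponents (0.5 * 2.0 gives n = 0,
// i.e. exactly 1.0), and neighbouring representable values differ by the
// factor b = 1.001, i.e. about a 0.1% relative step.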
-int *LogProb::subtbl = NULL; // - -const int LogProb::max_2byte_integer = 32767; -const int LogProb::min_2byte_integer = -32768; -const double LogProb::b = 1.001; // a logarithm basis -const double LogProb::logb2 = log(b); -//const int LogProb::nmax = round(78.0E0 * log(1.0E1) / logb2); -const int LogProb::nmax = round(300.0E0 * log(1.0E1) / logb2); -const int LogProb::nmin = -nmax; -const int LogProb::tblbnd = round(log((b-1.0E0)/2.0E0)/logb2); -const int LogProb::zeron = round(pow(-2, 23)); -const int LogProb::onen = 0; -const int LogProb::infn = onen - zeron; - -const int LogProb::initialized = LogProb::Initialize(); -const LogProb LogProb::zero(0); -const LogProb LogProb::one(1); -const LogProb LogProb::minus2(1e-2); -const LogProb LogProb::minus4(1e-4); -const LogProb LogProb::minus6(1e-6); -const LogProb LogProb::minus8(1e-8); -const LogProb LogProb::minus10(1e-10); -const LogProb LogProb::minus12(1e-12); -const LogProb LogProb::minus14(1e-14); -const LogProb LogProb::minus16(1e-16); - -// static table initialization function -int LogProb::Initialize() -{ - int nbytes = sizeof(double)*(nmax-nmin+1) + sizeof(int)*(0-tblbnd+1); - std::cerr << nbytes << " bytes used for LogProb tables (C++ version)\n"; - ntof = new double[nmax-nmin+1]; - addtbl = new int[-tblbnd+1]; - subtbl = new int[-tblbnd+1]; - - // char filename[257]; - // string filename ; - // ifstream ifs; - // ifs.open(filename.c_str()); - // if (!ifs) - // { - int i; - std::cerr << "Building integer logs conversion tables\n"; - ntof[0] = 0 ; - - for (i=nmin+1; i<=nmax; ++i) - { - double x = i; - ntof[i-nmin] = exp(x*logb2); - - } - for (i=tblbnd; i<=0; ++i) - { - double x = 1.0 + pow(b, i); - addtbl[i-tblbnd] = round(log(x)/logb2); - } - double sqrtb = exp(0.5*logb2); - for (i=0; i<=-tblbnd; ++i) - { - double x = sqrtb * pow(b, i) - 1.0; - subtbl[i] = round(log(x)/logb2); - } - // if (toolsRoot) - // { - // ofstream ofs(filename.c_str()); - // if (!ofs) - // cerr << "Could not write LogProb data to " << filename << endl; - // else - // { - // ofs.write((const char *)ntof, sizeof(double) * (nmax-nmin+1)); - // ofs.write((const char *)addtbl, sizeof(int) * (-tblbnd+1)); - // ofs.write((const char *)subtbl, sizeof(int) * (-tblbnd+1)); - // } - // } - // } - // else - // { - // ifs.read((char *)ntof, sizeof(double) * (nmax - nmin + 1)); - // ifs.read((char *)addtbl, sizeof(int) * (-tblbnd+1)); - // ifs.read((char *)subtbl, sizeof(int) * (-tblbnd+1)); - // } - return 1; -} - -void LogProb::FreeTables() -{ - delete [] addtbl; - delete [] subtbl; - delete [] ntof; -} - -//--------------------------------------------------------------------------- -// Aritmetic operators -//--------------------------------------------------------------------------- - - -// Subtract two logarithm numbers. Use the following method: -// b**n - b**m = b**m( b**(n-m) - 1 ), assuming n >= m. 
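// [Editorial gloss of the implementation below] With n = logr, m = subs.logr
// and a = n - m > 0:  b**n - b**m = b**m * (b**a - 1), so the result exponent is
//   m + log_b(b**a - 1),
// and subtbl[a] caches that second term (rounded when the table is built in
// Initialize(), using a sqrt(b) half-step correction). Once a > -tblbnd the
// subtrahend is negligible and the exponent stays n, i.e. *this is unchanged.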
-LogProb& LogProb::operator-=(const LogProb &subs) -{ - if (subs.logr == zeron) - return *this; - int a = logr - subs.logr; - if (a <= 0) - { - if (a < 0) - { - std::cerr << "WARNING(logprob): Invalid arguments to nsub" <<(*this)<< " " << subs << std::endl; - //abort(); - } - logr = zeron; - return *this; - } - if (a > -tblbnd) - return *this; - logr = subs.logr + subtbl[a]; - return *this; -} - - diff --git a/ext/giza-pp/GIZA++-v2/logprob.h b/ext/giza-pp/GIZA++-v2/logprob.h deleted file mode 100644 index 14696ac8..00000000 --- a/ext/giza-pp/GIZA++-v2/logprob.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _LOGPROB_H -#define _LOGPROB_H - -// Routines to perform integer exponential arithmetic. -// A number x is represented as n, where x = b**n -// It is assumed that b > 1, something like b = 1.001 - -#include -#include -#include - -//#define MAX(A,B) ((A) > (B) ? (A) : (B)) -//#define MIN(A,B) ((A) > (B) ? (B) : (A)) - - -class LogProb { -public: - // mj for cross entropy - double base2() const { - return (logr * logb2 / log(2)); - } - - // Constructors - LogProb() : logr(zeron) {} - LogProb(const LogProb &obj) : logr(obj.logr) {} - LogProb(double x) : logr(x == 0.0 ? zeron : round(log(x)/logb2)) {} - // destructor - ~LogProb() {} // default destructor - - operator double() const // converts logr to (double) b**logr - { - if (logr < nmin) return ntof[0]; - if (logr > nmax) return ntof[nmax-nmin]; - return ntof[logr-nmin]; - } - - LogProb &operator=(const LogProb &obj) { logr = obj.logr; return *this; } - int operator!() const { return logr == zeron; } - - // iostream friend specifications - friend std::ostream& operator<<(std::ostream& os, const LogProb &obj); - friend std::istream& operator>>(std::istream& is, LogProb &obj); - friend std::ostream& operator<<=(std::ostream& os, const LogProb &obj); - friend std::istream& operator>>=(std::istream& is, LogProb &obj); - - // arithmetic operators - LogProb &operator+=(const LogProb &add) // logr2 = logb ( b**logr2 + b**logr1 ) - // Add two numbers represented as logarithms. Use the following method: - // b**n + b**m = b**n(1 + b**(m-n)), assuming n >= m. 
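  // [Editorial gloss of the body below] After the conditional swap ensures
  // n >= m, a = m - n <= 0 and the sum's exponent is n + log_b(1 + b**a); the
  // term log_b(1 + b**a) is what addtbl[a - tblbnd] caches. When a < tblbnd,
  // b**a is below half a quantization step, so the smaller operand is dropped
  // entirely.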
- { - if (add.logr == zeron) - return *this; - if (logr == zeron) - { - logr = add.logr; - return *this; - } - int a = add.logr - logr; - if (a > 0) - { - a = -a; - logr = add.logr; - } - if (a < tblbnd) - return *this; - logr += addtbl[a-tblbnd]; - return *this; - } - - LogProb &operator-=(const LogProb &); // logr2 = logb ( b**logr2 + b**logr1 ) - LogProb operator*(const LogProb &mul) const // logr3 = logr2 + logr1 - { - LogProb result; // start out with result == 0 - if ((logr != zeron) && (mul.logr != zeron)) - result.logr = std::max(logr+mul.logr, zeron); - return result; - } - LogProb operator*(double x) const // logr3 = logr2 + logr1 - { - return (*this)*(LogProb)x; - } - LogProb operator^(const int i) const // logr2 = logr1 * i - { - LogProb result; // start out with result == 0 - // if ((logr != zeron) && (mul.logr != zeron)) - result.logr = logr * i ; - return result; - } - LogProb &operator*=(const LogProb &mul) // logr2 += logr1 - { - if ((logr == zeron) || (mul.logr == zeron)) - logr = zeron; - else - logr = std::max(logr+mul.logr, zeron); - return *this; - } - LogProb operator/(const LogProb &div) const // logr3 = logr2 -logr1 - { - LogProb result; - if (logr != zeron) - result.logr = std::max(logr - div.logr, zeron); - return result; - } - LogProb &operator/=(const LogProb &div) // logr2 -= logr1 - { - if (logr != zeron) - logr = std::max(logr - div.logr, zeron); - return *this; - } - LogProb operator+(const LogProb &l) const // logr3 = logb ( b**logr2 + b**logr1 ) - { LogProb result(*this); result += l; return result; } - LogProb operator-(const LogProb &l) const // logr3 = logb ( b**logr2 - b**logr1 ) - { LogProb result(*this); result -= l; return result; } - LogProb power(const int n) const // logr2 = logr1 * int - { LogProb result(*this); result.logr *= n; return result; } - - // Conditional operators - int operator<(const LogProb &obj) const { return logr < obj.logr; } - int operator<=(const LogProb &obj) const { return logr <= obj.logr; } - int operator>(const LogProb &obj) const { return logr > obj.logr; } - int operator>=(const LogProb &obj) const { return logr >= obj.logr; } - int operator==(const LogProb &obj) const { return logr == obj.logr; } - int operator!=(const LogProb &obj) const { return logr != obj.logr; } - int operator<(double d) const { return ((double)*this) < d; } - int operator<=(double d) const { return ((double)*this) <= d; } - int operator>(double d) const { return ((double)*this) > d; } - int operator>=(double d) const { return ((double)*this) >= d; } - int operator==(double d) const { return ((double)*this) == d; } - int operator!=(double d) const { return ((double)*this) != d; } - - - LogProb &SetZero() { logr = zeron; return *this; } // representation of 0, - LogProb &SetOne() { logr = onen; return *this; } // 1, and - LogProb &SetInf() { logr = infn; return *this; } // inf in logarithm domain - -private: - int logr; // a representation of logarithm - // static constants - static const int initialized; // initialization flag - static const double b; - static const double logb2; - static const int nmin, nmax; - static const int tblbnd; - static const int zeron, onen, infn; // zero, one, and inf in log domain - static const int max_2byte_integer, min_2byte_integer; - - // Arithmetic computation Tables - static double *ntof; - static int *addtbl; - static int *subtbl; - - static int Initialize(); - -public: - static void FreeTables(); - // constants for initializing LogProbs to 0 or 1 - static const LogProb zero; - static const LogProb one; - 
static const LogProb minus2; - static const LogProb minus4; - static const LogProb minus6; - static const LogProb minus8; - static const LogProb minus10; - static const LogProb minus12; - static const LogProb minus14; - static const LogProb minus16; -}; - -// iostream friend operators -inline std::ostream &operator<<(std::ostream& os, const LogProb &obj) -{ - return os << (double) obj; // output in linear domain, b**logr -} - -inline std::istream &operator>>(std::istream& is, LogProb &obj) -{ - double d; - is >> d; - obj = d; - return is; -} - -inline std::ostream &operator<<=(std::ostream& os, const LogProb &obj) // write binary -{ - os.write((const char *)&obj.logr, sizeof(obj.logr)); - return os; -} - -inline std::istream &operator>>=(std::istream& is, LogProb &obj) -{ - is.read((char *)&obj.logr, sizeof(obj.logr)); - return is; -} - -#endif - diff --git a/ext/giza-pp/GIZA++-v2/main.cpp b/ext/giza-pp/GIZA++-v2/main.cpp deleted file mode 100644 index d1b588f3..00000000 --- a/ext/giza-pp/GIZA++-v2/main.cpp +++ /dev/null @@ -1,719 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - -#include -#include "getSentence.h" -#include "TTables.h" -#include "model1.h" -#include "model2.h" -#include "model3.h" -#include "hmm.h" -#include "file_spec.h" -#include "defs.h" -#include "vocab.h" -#include "Perplexity.h" -#include "Dictionary.h" -#include "utility.h" -#include "Parameter.h" -#include "myassert.h" -#include "D4Tables.h" -#include "D5Tables.h" -#include "transpair_model4.h" -#include "transpair_model5.h" - -#define ITER_M2 0 -#define ITER_MH 5 - -GLOBAL_PARAMETER3(int,Model1_Iterations,"Model1_Iterations","NO. ITERATIONS MODEL 1","m1","number of iterations for Model 1",PARLEV_ITER,5); -GLOBAL_PARAMETER3(int,Model2_Iterations,"Model2_Iterations","NO. ITERATIONS MODEL 2","m2","number of iterations for Model 2",PARLEV_ITER,ITER_M2); -GLOBAL_PARAMETER3(int,HMM_Iterations,"HMM_Iterations","mh","number of iterations for HMM alignment model","mh", PARLEV_ITER,ITER_MH); -GLOBAL_PARAMETER3(int,Model3_Iterations,"Model3_Iterations","NO. ITERATIONS MODEL 3","m3","number of iterations for Model 3",PARLEV_ITER,5); -GLOBAL_PARAMETER3(int,Model4_Iterations,"Model4_Iterations","NO. ITERATIONS MODEL 4","m4","number of iterations for Model 4",PARLEV_ITER,5); -GLOBAL_PARAMETER3(int,Model5_Iterations,"Model5_Iterations","NO. ITERATIONS MODEL 5","m5","number of iterations for Model 5",PARLEV_ITER,0); -GLOBAL_PARAMETER3(int,Model6_Iterations,"Model6_Iterations","NO. 
ITERATIONS MODEL 6","m6","number of iterations for Model 6",PARLEV_ITER,0); - - -GLOBAL_PARAMETER(float, PROB_SMOOTH,"probSmooth","probability smoothing (floor) value ",PARLEV_OPTHEUR,1e-7); -GLOBAL_PARAMETER(float, MINCOUNTINCREASE,"minCountIncrease","minimal count increase",PARLEV_OPTHEUR,1e-7); - -GLOBAL_PARAMETER2(int,Transfer_Dump_Freq,"TRANSFER DUMP FREQUENCY","t2to3","output: dump of transfer from Model 2 to 3",PARLEV_OUTPUT,0); -GLOBAL_PARAMETER2(bool,Verbose,"verbose","v","0: not verbose; 1: verbose",PARLEV_OUTPUT,0); -GLOBAL_PARAMETER(bool,Log,"log","0: no logfile; 1: logfile",PARLEV_OUTPUT,0); - - -GLOBAL_PARAMETER(double,P0,"p0","fixed value for parameter p_0 in IBM-3/4 (if negative then it is determined in training)",PARLEV_EM,-1.0); -GLOBAL_PARAMETER(double,M5P0,"m5p0","fixed value for parameter p_0 in IBM-5 (if negative then it is determined in training)",PARLEV_EM,-1.0); -GLOBAL_PARAMETER3(bool,Peg,"pegging","p","DO PEGGING? (Y/N)","0: no pegging; 1: do pegging",PARLEV_EM,0); - -GLOBAL_PARAMETER(short,OldADBACKOFF,"adbackoff","",-1,0); -GLOBAL_PARAMETER2(unsigned int,MAX_SENTENCE_LENGTH,"ml","MAX SENTENCE LENGTH","maximum sentence length",0,MAX_SENTENCE_LENGTH_ALLOWED); - - -GLOBAL_PARAMETER(short, DeficientDistortionForEmptyWord,"DeficientDistortionForEmptyWord","0: IBM-3/IBM-4 as described in (Brown et al. 1993); 1: distortion model of empty word is deficient; 2: distoriton model of empty word is deficient (differently); setting this parameter also helps to avoid that during IBM-3 and IBM-4 training too many words are aligned with the empty word",PARLEV_MODELS,0); -short OutputInAachenFormat=0; -bool Transfer=TRANSFER; -bool Transfer2to3=0; -short NoEmptyWord=0; -bool FEWDUMPS=0; -GLOBAL_PARAMETER(bool,ONLYALDUMPS,"ONLYALDUMPS","1: do not write any files",PARLEV_OUTPUT,0); -GLOBAL_PARAMETER(short,CompactAlignmentFormat,"CompactAlignmentFormat","0: detailled alignment format, 1: compact alignment format ",PARLEV_OUTPUT,0); -GLOBAL_PARAMETER2(bool,NODUMPS,"NODUMPS","NO FILE DUMPS? (Y/N)","1: do not write any files",PARLEV_OUTPUT,0); - -GLOBAL_PARAMETER(WordIndex,MAX_FERTILITY,"MAX_FERTILITY","maximal fertility for fertility models",PARLEV_EM,10); - -Vector,char > > ReferenceAlignment; - - -bool useDict = false; -string CoocurrenceFile; -string Prefix, LogFilename, OPath, Usage, - SourceVocabFilename, TargetVocabFilename, CorpusFilename, - TestCorpusFilename, t_Filename, a_Filename, p0_Filename, d_Filename, - n_Filename, dictionary_Filename; - -ofstream logmsg ; -const string str2Num(int n){ - string number = ""; - do{ - number.insert((size_t)0, 1, (char)(n % 10 + '0')); - } while((n /= 10) > 0); - return(number) ; -} - - -double LAMBDA=1.09; -sentenceHandler *testCorpus=0,*corpus=0; -Perplexity trainPerp, testPerp, trainViterbiPerp, testViterbiPerp ; - -string ReadTablePrefix; - - -void printGIZAPars(ostream&out) -{ - out << "general parameters:\n" - "-------------------\n"; - printPars(out,getGlobalParSet(),0); - out << '\n'; - - out << "No. 
of iterations:\n-" - "------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_ITER); - out << '\n'; - - out << "parameter for various heuristics in GIZA++ for efficient training:\n" - "------------------------------------------------------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_OPTHEUR); - out << '\n'; - - out << "parameters for describing the type and amount of output:\n" - "-----------------------------------------------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_OUTPUT); - out << '\n'; - - out << "parameters describing input files:\n" - "----------------------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_INPUT); - out << '\n'; - - out << "smoothing parameters:\n" - "---------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_SMOOTH); - out << '\n'; - - out << "parameters modifying the models:\n" - "--------------------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_MODELS); - out << '\n'; - - out << "parameters modifying the EM-algorithm:\n" - "--------------------------------------\n"; - printPars(out,getGlobalParSet(),PARLEV_EM); - out << '\n'; -} - -const char*stripPath(const char*fullpath) - // strip the path info from the file name -{ - const char *ptr = fullpath + strlen(fullpath) - 1 ; - while(ptr && ptr > fullpath && *ptr != '/'){ptr--;} - if( *ptr=='/' ) - return(ptr+1); - else - return ptr; -} - - -void printDecoderConfigFile() -{ - string decoder_config_file = Prefix + ".Decoder.config" ; - cerr << "writing decoder configuration file to " << decoder_config_file.c_str() <<'\n'; - ofstream decoder(decoder_config_file.c_str()); - if(!decoder){ - cerr << "\nCannot write to " << decoder_config_file <<'\n'; - exit(1); - } - decoder << "# Template for Configuration File for the Rewrite Decoder\n# Syntax:\n" - << "# = \n# '#' is the comment character\n" - << "#================================================================\n" - << "#================================================================\n" - << "# LANGUAGE MODEL FILE\n# The full path and file name of the language model file:\n"; - decoder << "LanguageModelFile =\n"; - decoder << "#================================================================\n" - << "#================================================================\n" - << "# TRANSLATION MODEL FILES\n# The directory where the translation model tables as created\n" - << "# by Giza are located:\n#\n" - << "# Notes: - All translation model \"source\" files are assumed to be in\n" - << "# TM_RawDataDir, the binaries will be put in TM_BinDataDir\n" - << "#\n# - Attention: RELATIVE PATH NAMES DO NOT WORK!!!\n" - << "#\n# - Absolute paths (file name starts with /) will override\n" - << "# the default directory.\n\n"; - // strip file prefix info and leave only the path name in Prefix - string path = Prefix.substr(0, Prefix.find_last_of("/")+1); - if( path=="" ) - path="."; - decoder << "TM_RawDataDir = " << path << '\n'; - decoder << "TM_BinDataDir = " << path << '\n' << '\n'; - decoder << "# file names of the TM tables\n# Notes:\n" - << "# 1. TTable and InversTTable are expected to use word IDs not\n" - << "# strings (Giza produces both, whereby the *.actual.* files\n" - << "# use strings and are THE WRONG CHOICE.\n" - << "# 2. FZeroWords, on the other hand, is a simple list of strings\n" - << "# with one word per line. This file is typically edited\n" - << "# manually. 
However, this one listed here is generated by GIZA\n\n"; - - int lastmodel; - if (Model5_Iterations>0) - lastmodel = 5 ; - else if (Model4_Iterations>0) - lastmodel = 4 ; - else if (Model3_Iterations>0) - lastmodel = 3 ; - else if (Model2_Iterations>0) - lastmodel = 2 ; - else lastmodel = 1 ; - string lastModelName = str2Num(lastmodel); - string p=Prefix + ".t" + /*lastModelName*/"3" +".final"; - decoder << "TTable = " << stripPath(p.c_str()) << '\n'; - p = Prefix + ".ti.final" ; - decoder << "InverseTTable = " << stripPath(p.c_str()) << '\n'; - p=Prefix + ".n" + /*lastModelName*/"3" + ".final"; - decoder << "NTable = " << stripPath(p.c_str()) << '\n'; - p=Prefix + ".d" + /*lastModelName*/"3" + ".final"; - decoder << "D3Table = " << stripPath(p.c_str()) << '\n'; - p=Prefix + ".D4.final"; - decoder << "D4Table = " << stripPath(p.c_str()) << '\n'; - p=Prefix + ".p0_"+ /*lastModelName*/"3" + ".final"; - decoder << "PZero = " << stripPath(p.c_str()) << '\n'; - decoder << "Source.vcb = " << SourceVocabFilename << '\n'; - decoder << "Target.vcb = " << TargetVocabFilename << '\n'; - // decoder << "Source.classes = " << SourceVocabFilename + ".classes" << '\n'; - // decoder << "Target.classes = " << TargetVocabFilename + ".classes" <<'\n'; - decoder << "Source.classes = " << SourceVocabFilename+".classes" << '\n'; - decoder << "Target.classes = " << TargetVocabFilename + ".classes" <<'\n'; - p=Prefix + ".fe0_"+ /*lastModelName*/"3" + ".final"; - decoder << "FZeroWords = " << stripPath(p.c_str()) << '\n'; -} - -bool readNextSent(istream&is,map< pair<int,int>,char >&s,int&number) -{ - string x; - if( !(is >> x) ) return 0; - if( x=="SENT:" ) is >> x; - int n=atoi(x.c_str()); - if( number==-1 ) - number=n; - else - if( number!=n ) - { - cerr << "ERROR: readNextSent: DIFFERENT NUMBERS: " << number << " " << n << '\n'; - return 0; - } - int nS,nP,nO; - nS=nP=nO=0; - while( is >> x ) - { - if( x=="SENT:" ) - return 1; - int n1,n2; - is >> n1 >> n2; - map< pair<int,int>,char >::const_iterator i=s.find(pair<int,int>(n1,n2)); - if( i==s.end()||i->second=='P' ) - s[pair<int,int>(n1,n2)]=x[0]; - massert(x[0]=='S'||x[0]=='P'); - nS+= (x[0]=='S'); - nP+= (x[0]=='P'); - nO+= (!(x[0]=='S'||x[0]=='P')); - } - return 1; -} - -bool emptySent(map< pair<int,int>,char >&x) -{ - x = map< pair<int,int>,char >(); - return 1; -} - -void ReadAlignment(const string&x,Vector<map< pair<int,int>,char > >&a) -{ - ifstream infile(x.c_str()); - a.clear(); - map< pair<int,int>,char >sent; - int number=0; - while( emptySent(sent) && (readNextSent(infile,sent,number)) ) - { - if( int(a.size())!=number ) - cerr << "ERROR: ReadAlignment: " << a.size() << " " << number << '\n'; - a.push_back(sent); - number++; - } - cout << "Read: " << a.size() << " sentences in reference alignment." << '\n'; -}
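ReadAlignment above consumes a simple text format: a "SENT: <id>" header opens each sentence, followed by one "S <src> <tgt>" or "P <src> <tgt>" record per link, where a S(ure) link overrides a P(ossible) one for the same word pair. A compact sketch of a reader for that format (hypothetical names read_alignments/LinkSet; the deleted code additionally cross-checks the sentence numbering):

#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

typedef std::map<std::pair<int, int>, char> LinkSet;

std::vector<LinkSet> read_alignments(const char* fname) {
  std::vector<LinkSet> sents;
  std::ifstream in(fname);
  std::string tag;
  while (in >> tag) {
    if (tag == "SENT:") {                  // header: start a new sentence
      int id;
      in >> id;
      sents.push_back(LinkSet());
    } else if (!sents.empty()) {           // link record: tag is "S" or "P"
      int i, j;
      in >> i >> j;
      LinkSet& cur = sents.back();
      std::pair<int, int> key(i, j);
      if (cur.find(key) == cur.end() || cur[key] == 'P')
        cur[key] = tag[0];                 // a Sure link beats a Possible one
    }
  }
  return sents;
}

int main(int argc, char** argv) {
  if (argc < 2) return 1;
  std::cout << "Read: " << read_alignments(argv[1]).size()
            << " sentences in reference alignment.\n";
  return 0;
}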
void initGlobals(void) -{ - NODUMPS = false ; - Prefix = Get_File_Spec(); - LogFilename= Prefix + ".log"; - MAX_SENTENCE_LENGTH = MAX_SENTENCE_LENGTH_ALLOWED ; -} - -void convert(const map< pair<int,int>,char >&reference,alignment&x) -{ - int l=x.get_l(); - int m=x.get_m(); - for(map< pair<int,int>,char >::const_iterator i=reference.begin();i!=reference.end();++i) - { - if( i->first.first+1>int(m) ) - { - cerr << "ERROR m too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n"; - continue; - } - if( i->first.second+1>int(l) ) - { - cerr << "ERROR l too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n"; - continue; - } - if( x(i->first.first+1)!=0 ) - cerr << "ERROR: position " << i->first.first+1 << " already set\n"; - x.set(i->first.first+1,i->first.second+1); - } -} -double ErrorsInAlignment(const map< pair<int,int>,char >&reference,const Vector<WordIndex>&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int pair_no) -{ - int err=0; - for(unsigned int j=1;j<test.size();j++) - { - if( test[j]>0 ) - { - map< pair<int,int>,char >::const_iterator i=reference.find(make_pair(test[j]-1,j-1)); - if( i==reference.end() ) - { - toomuch++; - err++; - } - else - if( !(i->second=='S' || i->second=='P')) - cerr << "ERROR: wrong symbol in reference alignment '" << i->second << ' ' << int(i->second) << " no:" << pair_no<< "'\n"; - eventsToomuch++; - } - } - for(map< pair<int,int>,char >::const_iterator i=reference.begin();i!=reference.end();++i) - { - if( i->second=='S' ) - { - unsigned int J=i->first.second+1; - unsigned int I=i->first.first+1; - if( int(J)>=int(test.size())||int(I)>int(l)||int(J)<1||int(I)<1 ) - cerr << "ERROR: alignment outside of range in reference alignment" << J << " " << test.size() << " (" << I << " " << l << ") no:" << pair_no << '\n'; - else - { - if(test[J]!=I) - { - missing++; - err++; - } - } - eventsMissing++; - } - } - if( Verbose ) - cout << err << " errors in sentence\n"; - if( eventsToomuch+eventsMissing ) - return (toomuch+missing)/double(eventsToomuch+eventsMissing); - else - return 1.0; -} - - -vcbList *globeTrainVcbList,*globfTrainVcbList; - -double StartTraining(int&result) -{ - double errors=0.0; - vcbList eTrainVcbList, fTrainVcbList; - globeTrainVcbList=&eTrainVcbList; - globfTrainVcbList=&fTrainVcbList; - - - string repFilename = Prefix + ".gizacfg" ; - ofstream of2(repFilename.c_str()); - writeParameters(of2,getGlobalParSet(),-1) ; - - cout << "reading vocabulary files \n"; - eTrainVcbList.setName(SourceVocabFilename.c_str()); - fTrainVcbList.setName(TargetVocabFilename.c_str()); - eTrainVcbList.readVocabList(); - fTrainVcbList.readVocabList(); - cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens \n"; - cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens \n"; - - vcbList eTestVcbList(eTrainVcbList) ; - vcbList fTestVcbList(fTrainVcbList) ; - - corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList); - - if (TestCorpusFilename == "NONE") - TestCorpusFilename = ""; - - if (TestCorpusFilename != ""){ - cout << "Test corpus will be read from: " << TestCorpusFilename << '\n'; - testCorpus= new sentenceHandler(TestCorpusFilename.c_str(), - &eTestVcbList, &fTestVcbList); - cout << " Test total # sentence pairs : " <<(*testCorpus).getTotalNoPairs1()<<" weighted:"<<(*testCorpus).getTotalNoPairs2() <<'\n'; - - cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n"; - cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens \n";
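ErrorsInAlignment above tracks two error counts per sentence: toomuch (hypothesis links absent from the reference) and missing (Sure reference links absent from the hypothesis), and returns their normalized sum. This is exactly the bookkeeping behind the alignment error rate of Och and Ney; stated directly over link sets, with S the Sure and P the Possible reference links (S ⊆ P), it reads as in this sketch (hypothetical aer helper, not the deleted routine's interface):

#include <set>
#include <utility>

typedef std::set<std::pair<int, int> > Links;

// precision = |A ∩ P| / |A|,  recall = |A ∩ S| / |S|,
// AER = 1 - (|A ∩ S| + |A ∩ P|) / (|A| + |S|)
double aer(const Links& A, const Links& S, const Links& P) {
  int a_s = 0, a_p = 0;
  for (Links::const_iterator it = A.begin(); it != A.end(); ++it) {
    if (S.count(*it)) ++a_s;
    if (P.count(*it)) ++a_p;               // P is assumed to contain S
  }
  return 1.0 - double(a_s + a_p) / double(A.size() + S.size());
}

The deleted routine computes the same value incrementally: toomuch corresponds to |A| - |A ∩ P| and missing to |S| - |A ∩ S|, with eventsToomuch = |A| and eventsMissing = |S|, so the counts can also be aggregated over the whole corpus.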
of test corpus: " << fTestVcbList.totalVocab() << " tokens \n"; - cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; - cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; - cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) / - eTestVcbList.totalVocab() << '\n'; - } - - cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n'; - cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2() << " tokens\n"; - cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens \n"; - cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; - cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; - cout << "lambda for PP calculation in IBM-1,IBM-2,HMM:= " << double(fTrainVcbList.totalVocab()) << "/(" << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")="; - LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2()); - cout << "= " << LAMBDA << '\n'; - // load dictionary - Dictionary *dictionary; - useDict = !dictionary_Filename.empty(); - if (useDict) dictionary = new Dictionary(dictionary_Filename.c_str()); - else dictionary = new Dictionary(""); - int minIter=0; -#ifdef BINARY_SEARCH_FOR_TTABLE - if( CoocurrenceFile.length()==0 ) - { - cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n"; - abort(); - } - //ifstream coocs(CoocurrenceFile.c_str()); - tmodel tTable(CoocurrenceFile); -#else - tmodel tTable; -#endif - - model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList,tTable,trainPerp, - *corpus,&testPerp, testCorpus, - trainViterbiPerp, &testViterbiPerp); - amodel aTable(false); - amodel aCountTable(false); - model2 m2(m1,aTable,aCountTable); - hmm h(m2); - model3 m3(m2); - if(ReadTablePrefix.length() ) - { - string number = "final"; - string tfile,afilennfile,dfile,d4file,p0file,afile,nfile; //d5file - tfile = ReadTablePrefix + ".t3." + number ; - afile = ReadTablePrefix + ".a3." + number ; - nfile = ReadTablePrefix + ".n3." + number ; - dfile = ReadTablePrefix + ".d3." + number ; - d4file = ReadTablePrefix + ".d4." + number ; - //d5file = ReadTablePrefix + ".d5." + number ; - p0file = ReadTablePrefix + ".p0_3." 
+ number ; - tTable.readProbTable(tfile.c_str()); - aTable.readTable(afile.c_str()); - m3.dTable.readTable(dfile.c_str()); - m3.nTable.readNTable(nfile.c_str()); - sentPair sent ; - double p0; - ifstream p0f(p0file.c_str()); - p0f >> p0; - d4model d4m(MAX_SENTENCE_LENGTH); - d4m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); - d4m.readProbTable(d4file.c_str()); - //d5model d5m(d4m); - //d5m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); - //d5m.readProbTable(d5file.c_str()); - makeSetCommand("model4smoothfactor","0.0",getGlobalParSet(),2); - //makeSetCommand("model5smoothfactor","0.0",getGlobalParSet(),2); - if( corpus||testCorpus ) - { - sentenceHandler *x=corpus; - if(x==0) - x=testCorpus; - cout << "Text corpus exists.\n"; - x->rewind(); - while(x&&x->getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - int l=es.size()-1; - int m=fs.size()-1; - transpair_model4 tm4(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d4m); - alignment al(l,m); - cout << "I use the alignment " << sent.sentenceNo-1 << '\n'; - //convert(ReferenceAlignment[sent.sentenceNo-1],al); - transpair_model3 tm3(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,0); - double p=tm3.prob_of_target_and_alignment_given_source(al,1); - cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n'; - p=tm4.prob_of_target_and_alignment_given_source(al,3,1); - cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n'; - //transpair_model5 tm5(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d5m); - //p=tm5.prob_of_target_and_alignment_given_source(al,3,1); - //cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n'; - } - } - else - { - cout << "No corpus exists.\n"; - } - } - else - { - // initialize model1 - bool seedModel1 = false ; - if(Model1_Iterations > 0){ - if (t_Filename != "NONE" && t_Filename != ""){ - seedModel1 = true ; - m1.load_table(t_Filename.c_str()); - } - minIter=m1.em_with_tricks(Model1_Iterations,seedModel1,*dictionary, useDict); - errors=m1.errorsAL(); - } - - { - if(Model2_Iterations > 0){ - m2.initialize_table_uniformly(*corpus); - minIter=m2.em_with_tricks(Model2_Iterations); - errors=m2.errorsAL(); - } - if(HMM_Iterations > 0){ - cout << "NOTE: I am doing iterations with the HMM model!\n"; - h.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); - h.initialize_table_uniformly(*corpus); - minIter=h.em_with_tricks(HMM_Iterations); - errors=h.errorsAL(); - } - - if(Transfer2to3||HMM_Iterations==0){ - if( HMM_Iterations>0 ) - cout << "WARNING: transfor is not needed, as results are overwritten bei transfer from HMM.\n"; - string test_alignfile = Prefix +".tst.A2to3"; - if (testCorpus) - m2.em_loop(testPerp, *testCorpus,Transfer_Dump_Freq==1&&!NODUMPS,test_alignfile.c_str(), testViterbiPerp, true); - if (testCorpus) - cout << "\nTransfer: TEST CROSS-ENTROPY " << testPerp.cross_entropy() << " PERPLEXITY " << testPerp.perplexity() << "\n\n"; - if (Transfer == TRANSFER_SIMPLE) - m3.transferSimple(*corpus, Transfer_Dump_Freq==1&&!NODUMPS,trainPerp, trainViterbiPerp); - else - m3.transfer(*corpus, Transfer_Dump_Freq==1&&!NODUMPS, trainPerp, trainViterbiPerp); - errors=m3.errorsAL(); - } - - if( HMM_Iterations>0 ) - m3.setHMM(&h); - if(Model3_Iterations > 0 || Model4_Iterations > 0 || Model5_Iterations || Model6_Iterations - ) - { - 
minIter=m3.viterbi(Model3_Iterations,Model4_Iterations,Model5_Iterations,Model6_Iterations); - errors=m3.errorsAL(); - } - if (FEWDUMPS||!NODUMPS) - { - printAllTables(eTrainVcbList,eTestVcbList,fTrainVcbList,fTestVcbList,m1 ); - } - } - } - result=minIter; - return errors; -} - -int main(int argc, char* argv[]) -{ -#ifdef BINARY_SEARCH_FOR_TTABLE - getGlobalParSet().insert(new Parameter("CoocurrenceFile",ParameterChangedFlag,"",CoocurrenceFile,PARLEV_SPECIAL)); -#endif - getGlobalParSet().insert(new Parameter("ReadTablePrefix",ParameterChangedFlag,"optimized",ReadTablePrefix,-1)); - getGlobalParSet().insert(new Parameter("S",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,PARLEV_INPUT)); - getGlobalParSet().insert(new Parameter("SOURCE VOCABULARY FILE",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,-1)); - getGlobalParSet().insert(new Parameter("T",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,PARLEV_INPUT)); - getGlobalParSet().insert(new Parameter("TARGET VOCABULARY FILE",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,-1)); - getGlobalParSet().insert(new Parameter("C",ParameterChangedFlag,"training corpus file name",CorpusFilename,PARLEV_INPUT)); - getGlobalParSet().insert(new Parameter("CORPUS FILE",ParameterChangedFlag,"training corpus file name",CorpusFilename,-1)); - getGlobalParSet().insert(new Parameter("TC",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,PARLEV_INPUT)); - getGlobalParSet().insert(new Parameter("TEST CORPUS FILE",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,-1)); - getGlobalParSet().insert(new Parameter("d",ParameterChangedFlag,"dictionary file name",dictionary_Filename,PARLEV_INPUT)); - getGlobalParSet().insert(new Parameter("DICTIONARY",ParameterChangedFlag,"dictionary file name",dictionary_Filename,-1)); - getGlobalParSet().insert(new Parameter("l",ParameterChangedFlag,"log file name",LogFilename,PARLEV_OUTPUT)); - getGlobalParSet().insert(new Parameter("LOG FILE",ParameterChangedFlag,"log file name",LogFilename,-1)); - - getGlobalParSet().insert(new Parameter("o",ParameterChangedFlag,"output file prefix",Prefix,PARLEV_OUTPUT)); - getGlobalParSet().insert(new Parameter("OUTPUT FILE PREFIX",ParameterChangedFlag,"output file prefix",Prefix,-1)); - getGlobalParSet().insert(new Parameter("OUTPUT PATH",ParameterChangedFlag,"output path",OPath,PARLEV_OUTPUT)); - - time_t st1, fn; - st1 = time(NULL); // starting time - - string temp(argv[0]); - Usage = temp + " [options]\n"; - if(argc < 2) - { - printHelp(); - exit(1); - } - - initGlobals() ; - parseArguments(argc, argv); - - if (Log) - logmsg.open(LogFilename.c_str(), ios::out); - - printGIZAPars(cout); - int a=-1; - double errors=0.0; - if( OldADBACKOFF!=0 ) - cerr << "WARNING: Parameter -adBackOff does not exist further; use CompactADTable instead.\n"; - if( MAX_SENTENCE_LENGTH > MAX_SENTENCE_LENGTH_ALLOWED ) - cerr << "ERROR: MAX_SENTENCE_LENGTH is too big " << MAX_SENTENCE_LENGTH << " > " << MAX_SENTENCE_LENGTH_ALLOWED << '\n'; - errors=StartTraining(a); - fn = time(NULL); // finish time - cout << '\n' << "Entire Training took: " << difftime(fn, st1) << " seconds\n"; - cout << "Program Finished at: "<< ctime(&fn) << '\n'; - cout << "==========================================================\n"; - return 0; -} - diff --git a/ext/giza-pp/GIZA++-v2/model1.cpp b/ext/giza-pp/GIZA++-v2/model1.cpp deleted file mode 100644 index b1b6d921..00000000 --- a/ext/giza-pp/GIZA++-v2/model1.cpp +++ 
/dev/null @@ -1,283 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "model1.h" -#include "Globals.h" -#include "utility.h" -#include "Parameter.h" - -extern short NoEmptyWord; -extern int VerboseSentence; - -GLOBAL_PARAMETER2(int,Model1_Dump_Freq,"MODEL 1 DUMP FREQUENCY","t1","dump frequency of Model 1",PARLEV_OUTPUT,0); -int NumberOfVALIalignments=100; - -model1::model1(const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel&_tTable,Perplexity& _perp, - sentenceHandler& _sHandler1, - Perplexity* _testPerp, - sentenceHandler* _testHandler, - Perplexity& _trainViterbiPerp, - Perplexity* _testViterbiPerp): - report_info(_perp,_sHandler1,_testPerp,_testHandler,_trainViterbiPerp,_testViterbiPerp), - efFilename(efname), Elist(evcblist), Flist(fvcblist), - eTotalWCount(Elist.totalVocab()), fTotalWCount(Flist.totalVocab()), - noEnglishWords(Elist.size()), noFrenchWords(Flist.size()), tTable(_tTable), - evlist(Elist.getVocabList()), fvlist(Flist.getVocabList()) -{} - -void model1::initialize_table_uniformly(sentenceHandler& sHandler1){ - WordIndex i, j; - - cout << "Initialize tTable\n"; - - sentPair sent ; - sHandler1.rewind(); - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - PROB uniform = 1.0/es.size() ; - for( i=0; i < es.size(); i++) - for(j=1; j < fs.size(); j++) - tTable.insert(es[i],fs[j],0,uniform); - } -} - - -int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler& sHandler1, */ - bool seedModel1, Dictionary& dictionary, bool useDict /*Perplexity* testPerp, sentenceHandler* testHandler, - Perplexity& trainViterbiPerp, Perplexity* testViterbiPerp */ ) -{ - double minErrors=1.0;int minIter=0; - string modelName="Model1",shortModelName="1"; - time_t st, it_st, fn, it_fn; - string tfile, number, alignfile, test_alignfile; - int pair_no; - bool dump_files = false ; - st = time(NULL); - sHandler1.rewind(); - cout << "==========================================================\n"; - cout << modelName << " Training Started at: "<< ctime(&st) << "\n"; - for(int it = 1; it <= noIterations; it++){ - pair_no = 0 ; - it_st = time(NULL); - cout << "-----------\n" << modelName << ": Iteration " << it << '\n'; - dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ; - number = ""; - int n = it; - do{ - number.insert((size_t)0, 1, (char)(n % 10 + '0')); - } while((n /= 10) > 0); - tfile = Prefix + ".t" + shortModelName + "." + number ; - alignfile = Prefix + ".A" + shortModelName + "." + number ; - test_alignfile = Prefix +".tst.A" + shortModelName + "." 
+ number ; - initAL(); - em_loop(it,perp, sHandler1, seedModel1, dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp); - if (testPerp && testHandler) // calculate test perplexity - em_loop(it,*testPerp, *testHandler, seedModel1, dump_files, test_alignfile.c_str(), dictionary, useDict, *testViterbiPerp, true); - if( errorsAL()& es = sent.eSent; - Vector& fs = sent.fSent; - const float so = sent.getCount(); - l = es.size() - 1; - m = fs.size() - 1; - cross_entropy = log(1.0); - Vector viterbi_alignment(fs.size()); - double viterbi_score = 1 ; - - bool eindict[l + 1]; - bool findict[m + 1]; - bool indict[m + 1][l + 1]; - if(it == 1 && useDict){ - for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false; - for(unsigned int dummy = 0; dummy <= m; dummy++){ - findict[dummy] = false; - for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++) - indict[dummy][dummy2] = false; - } - for(j = 0; j <= m; j++) - for(i = 0; i <= l; i++) - if(dict.indict(fs[j], es[i])){ - eindict[i] = findict[j] = indict[j][i] = true; - } - } - - for(j=1; j <= m; j++){ - // entries that map fs to all possible ei in this sentence. - Vector *> sPtrCache(es.size(),0); // cache pointers to table - LpPair **sPtrCachePtr; - - PROB denom = 0.0; - WordIndex best_i = 0 ; // i for which fj is best maped to ei - PROB word_best_score = 0 ; // score for the best mapping of fj - if (it == 1 && !seedModel1){ - denom = uniform * es.size() ; - word_best_score = uniform ; - } - else - for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ - PROB e(0.0) ; - (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ; - if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) - e = (*((*sPtrCachePtr))).prob; - else e = PROB_SMOOTH ; - denom += e ; - if (e > word_best_score){ - word_best_score = e ; - best_i = i ; - } } - viterbi_alignment[j] = best_i ; - viterbi_score *= word_best_score ; /// denom ; - if (denom == 0){ - if (test) - cerr << "WARNING: denom is zero (TEST)\n"; - else - cerr << "WARNING: denom is zero (TRAIN)\n"; - } - cross_entropy += log(denom) ; - if (!test){ - if(denom > 0){ - COUNT val = COUNT(so) / (COUNT) double(denom) ; - /* this if loop implements a constraint on counting: - count(es[i], fs[j]) is implemented if and only if - es[i] and fs[j] occur together in the dictionary, - OR - es[i] does not occur in the dictionary with any fs[x] and - fs[j] does not occur in the dictionary with any es[y] - */ - if(it == 1 && useDict){ - for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ - if(indict[j][i] || (!findict[j] && !eindict[i])){ - PROB e(0.0) ; - if (it == 1 && !seedModel1) - e = uniform ; - else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) - e = (*((*sPtrCachePtr))).prob; - else e = PROB_SMOOTH ; - COUNT x=e*val; - if( it==1||x>MINCOUNTINCREASE ) - if ((*sPtrCachePtr) != 0) - (*((*sPtrCachePtr))).count += x; - else - tTable.incCount(es[i], fs[j], x); - } /* end of if */ - } /* end of for i */ - } /* end of it == 1 */ - // Old code: - else{ - for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ - //for(i=0; i <= l; i++) { - PROB e(0.0) ; - if (it == 1 && !seedModel1) - e = uniform ; - else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) - e = (*((*sPtrCachePtr))).prob; - else e = PROB_SMOOTH ; - //if( !(i==0) ) - //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl; - COUNT x=e*val; - if( pair_no==VerboseSentence ) - cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << 
fvlist[fs[j]].word << ")=" << x << endl; - if( it==1||x>MINCOUNTINCREASE ) - if( NoEmptyWord==0 || i!=0 ) - if ((*sPtrCachePtr) != 0) - (*((*sPtrCachePtr))).count += x; - else - tTable.incCount(es[i], fs[j], x); - } /* end of for i */ - } // end of else - } // end of if (denom > 0) - }// if (!test) - } // end of for (j) ; - sHandler1.setProbOfSentence(sent,cross_entropy); - //cerr << sent << "CE: " << cross_entropy << " " << so << endl; - perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1); - viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1); - if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000)) - printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score); - addAL(viterbi_alignment,sent.sentenceNo,l); - pair_no++; - } /* of while */ - sHandler1.rewind(); - perp.record("Model1"); - viterbi_perp.record("Model1"); - errorReportAL(cout, "IBM-1"); -} diff --git a/ext/giza-pp/GIZA++-v2/model1.h b/ext/giza-pp/GIZA++-v2/model1.h deleted file mode 100644 index 7273049c..00000000 --- a/ext/giza-pp/GIZA++-v2/model1.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
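The em_loop just deleted above is the IBM Model 1 E-step: for each target position j it forms denom = sum_i t(f_j | e_i) over all source positions (including the empty word e_0), adds log denom to the sentence's cross-entropy, and distributes the sentence weight over the pairs (e_i, f_j) in proportion to t(f_j | e_i) / denom. A stripped-down sketch with a flat map in place of GIZA++'s tmodel/LpPair pointer cache, and omitting the iteration-1 uniform seeding and dictionary filtering of the original (TTable and e_step are hypothetical names):

#include <cmath>
#include <map>
#include <utility>
#include <vector>

typedef std::map<std::pair<int, int>, double> TTable;   // t(f|e) keyed by (e, f)

// One E-step update for a sentence pair; es[0] plays the empty word and
// fs is used 1-based, mirroring the deleted code's conventions.
void e_step(const std::vector<int>& es, const std::vector<int>& fs,
            const TTable& t, TTable& counts, double& cross_entropy) {
  const double PROB_SMOOTH = 1e-7;                      // floor, as in the probSmooth parameter
  for (size_t j = 1; j < fs.size(); ++j) {
    double denom = 0.0;
    for (size_t i = 0; i < es.size(); ++i) {
      TTable::const_iterator it = t.find(std::make_pair(es[i], fs[j]));
      denom += (it != t.end() && it->second > PROB_SMOOTH) ? it->second : PROB_SMOOTH;
    }
    cross_entropy += std::log(denom);                   // per-position normalizer
    for (size_t i = 0; i < es.size(); ++i) {            // fractional counts e / denom
      TTable::const_iterator it = t.find(std::make_pair(es[i], fs[j]));
      double e = (it != t.end() && it->second > PROB_SMOOTH) ? it->second : PROB_SMOOTH;
      counts[std::make_pair(es[i], fs[j])] += e / denom;
    }
  }
}

The M-step then renormalizes counts over f for each e (tTable.normalizeTable in the deleted sources); the per-sentence Viterbi alignment falls out of the same loop as the argmax of the summands.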
- -*/ -#ifndef _model1_h -#define _model1_h 1 - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "Vector.h" -#include "vocab.h" -#include "TTables.h" -#include "getSentence.h" -#include "Perplexity.h" -#include "vocab.h" -#include "Dictionary.h" - -extern int NumberOfVALIalignments; - -class report_info{ - protected: - Perplexity& perp; - sentenceHandler& sHandler1; - Perplexity* testPerp; - sentenceHandler* testHandler; - Perplexity& trainViterbiPerp; - Perplexity* testViterbiPerp; - report_info(Perplexity& _perp, - sentenceHandler& _sHandler1, - Perplexity* _testPerp, - sentenceHandler* _testHandler, - Perplexity& _trainViterbiPerp, - Perplexity* _testViterbiPerp) - : perp(_perp),sHandler1(_sHandler1),testPerp(_testPerp),testHandler(_testHandler),trainViterbiPerp(_trainViterbiPerp),testViterbiPerp(_testViterbiPerp) - {} -}; - -class model1 : public report_info{ -public: - string efFilename; - vcbList& Elist ; - vcbList& Flist ; - double eTotalWCount ; // size of source copus in number of words - double fTotalWCount ; // size of target corpus in number of words - int noEnglishWords; - int noFrenchWords; - tmodel&tTable; - Vector& evlist ; - Vector& fvlist ; -public: - int ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch; - int ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI; - int ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST; - model1 (const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel&_tTable,Perplexity& _perp, - sentenceHandler& _sHandler1, - Perplexity* _testPerp, - sentenceHandler* _testHandler, - Perplexity& _trainViterbiPerp, - Perplexity* _testViterbiPerp); - void initialize_table_uniformly(sentenceHandler& sHandler1); - int em_with_tricks(int noIterations, - bool seedModel1, Dictionary& dictionary, bool useDict); - void load_table(const char* tname); - void readVocabFile(const char* fname, Vector& vlist, int& vsize, - int& total); - inline Vector& getEnglishVocabList(void)const {return Elist.getVocabList();}; - inline Vector& getFrenchVocabList(void)const {return Flist.getVocabList();}; - inline double getETotalWCount(void) const {return eTotalWCount;}; - inline double getFTotalWCount(void) const {return fTotalWCount;}; - inline int getNoEnglishWords(void) const {return noEnglishWords;}; - inline int getNoFrenchWords(void) const {return noFrenchWords;}; - inline tmodel& getTTable(void) {return tTable;}; - inline string& getEFFilename(void) {return efFilename;}; - private: - void em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, bool , const char*, Dictionary& dictionary, bool useDict, - Perplexity& viterbiperp, bool=false); - friend class model2; - friend class hmm; - public: - void addAL(const Vector& viterbi_alignment,int pair_no,int l) - { - if( pair_no<=int(ReferenceAlignment.size()) ) - { - //cerr << "AL: " << viterbi_alignment << " " << pair_no << endl; - ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); - if( pair_no<=NumberOfVALIalignments ) - ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI,pair_no); - if( pair_no>NumberOfVALIalignments ) - ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST,pair_no); - } - } - void initAL() - 
{ALmissingVALI=ALtoomuchVALI=ALeventsMissingVALI=ALeventsToomuchVALI=ALmissingTEST=ALtoomuchTEST=ALeventsMissingTEST=ALeventsToomuchTEST=ALmissing=ALtoomuch=ALeventsMissing=ALeventsToomuch=0;} - double errorsAL()const - { - if( ALeventsMissingVALI+ALeventsToomuchVALI ) - return (ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI); - else - return 0.0; - } - void errorReportAL(ostream&out,string m)const - { - if( ALeventsMissing+ALeventsToomuch ) - out << "alignmentErrors (" << m << "): " - << 100.0*(ALmissing+ALtoomuch)/double(ALeventsMissing+ALeventsToomuch) - << " recall: " << 100.0*(1.0-ALmissing/double(ALeventsMissing)) - << " precision: " << 100.0*(1.0-ALtoomuch/double(ALeventsToomuch)) - << " (missing:" << ALmissing << "/" << ALeventsMissing << " " << ALtoomuch - << " " << ALeventsToomuch << ")\n"; - if( ALeventsMissingVALI+ALeventsToomuchVALI ) - out << "alignmentErrors VALI (" << m << "): " - << 100.0*(ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI) - << " recall: " << 100.0*(1.0-ALmissingVALI/double(ALeventsMissingVALI)) - << " precision: " << 100.0*(1.0-ALtoomuchVALI/double(ALeventsToomuchVALI)) - << " (missing:" << ALmissingVALI << "/" << ALeventsMissingVALI << " " << ALtoomuchVALI - << " " << ALeventsToomuchVALI << ")\n"; - if( ALeventsMissingTEST+ALeventsToomuchTEST ) - out << "alignmentErrors TEST(" << m << "): " - << 100.0*(ALmissingTEST+ALtoomuchTEST)/double(ALeventsMissingTEST+ALeventsToomuchTEST) - << " recall: " << 100.0*(1.0-ALmissingTEST/double(ALeventsMissingTEST)) - << " precision: " << 100.0*(1.0-ALtoomuchTEST/double(ALeventsToomuchTEST)) - << " (missing:" << ALmissingTEST << "/" << ALeventsMissingTEST << " " << ALtoomuchTEST - << " " << ALeventsToomuchTEST << ")\n"; - - } -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/model2.cpp b/ext/giza-pp/GIZA++-v2/model2.cpp deleted file mode 100644 index 945b91e0..00000000 --- a/ext/giza-pp/GIZA++-v2/model2.cpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#include "model2.h" -#include "Globals.h" -#include "utility.h" -#include "Parameter.h" -#include "defs.h" - -extern short NoEmptyWord; - - -GLOBAL_PARAMETER2(int,Model2_Dump_Freq,"MODEL 2 DUMP FREQUENCY","t2","dump frequency of Model 2",PARLEV_OUTPUT,0); - -model2::model2(model1& m,amodel&_aTable,amodel&_aCountTable): - model1(m),aTable(_aTable),aCountTable(_aCountTable) -{ } - -void model2::initialize_table_uniformly(sentenceHandler& sHandler1){ - // initialize the aTable uniformly (run this before running em_with_tricks) - int n=0; - sentPair sent ; - sHandler1.rewind(); - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - WordIndex l = es.size() - 1; - WordIndex m = fs.size() - 1; - n++; - if(1<=m&&aTable.getValue(l,m,l,m)<=PROB_SMOOTH) - { - PROB uniform_val = 1.0 / (l+1) ; - for(WordIndex j=1; j <= m; j++) - for(WordIndex i=0; i <= l; i++) - aTable.setValue(i,j, l, m, uniform_val); - } - } -} - -int model2::em_with_tricks(int noIterations) -{ - double minErrors=1.0;int minIter=0; - string modelName="Model2",shortModelName="2"; - time_t it_st, st, it_fn, fn; - string tfile, afile, number, alignfile, test_alignfile; - int pair_no = 0; - bool dump_files = false ; - ofstream of2 ; - st = time(NULL) ; - sHandler1.rewind(); - cout << "\n==========================================================\n"; - cout << modelName << " Training Started at: " << ctime(&st) << " iter: " << noIterations << "\n"; - for(int it=1; it <= noIterations ; it++){ - pair_no = 0; - it_st = time(NULL) ; - cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n'; - dump_files = (Model2_Dump_Freq != 0) && ((it % Model2_Dump_Freq) == 0) && !NODUMPS; - number = ""; - int n = it; - do{ - number.insert((size_t)0, 1, (char)(n % 10 + '0')); - } while((n /= 10) > 0); - tfile = Prefix + ".t" + shortModelName + "." + number ; - afile = Prefix + ".a" + shortModelName + "." + number ; - alignfile = Prefix + ".A" + shortModelName + "." + number ; - test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ; - aCountTable.clear(); - initAL(); - em_loop(perp, sHandler1, dump_files, alignfile.c_str(), trainViterbiPerp, false); - if( errorsAL()cross_entropy() - << " PERPLEXITY " << testViterbiPerp->perplexity() - << '\n'; - if (dump_files) - { - if(OutputInAachenFormat==0) - tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat); - aCountTable.printTable(afile.c_str()); - } - it_fn = time(NULL) ; - cout << modelName << " Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n"; - } // end of iterations - aCountTable.clear(); - fn = time(NULL) ; - cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n"; - // cout << "tTable contains " << tTable.getHash().bucket_count() - // << " buckets and " << tTable.getHash().size() << " entries." ; - cout << "==========================================================\n"; - return minIter; -} - -void model2::load_table(const char* aname){ - /* This function loads the a table from the given file; use it - when you want to load results from previous a training without - doing any new training. 
- NAS, 7/11/99 - */ - cout << "Model2: loading a table \n"; - aTable.readTable(aname); -} - - -void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1, - bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, - bool test) -{ - massert( aTable.is_distortion==0 ); - massert( aCountTable.is_distortion==0 ); - WordIndex i, j, l, m ; - double cross_entropy; - int pair_no=0 ; - perp.clear(); - viterbi_perp.clear(); - ofstream of2; - // for each sentence pair in the corpus - if (dump_alignment||FEWDUMPS ) - of2.open(alignfile); - sentPair sent ; - - vector ferts(evlist.size()); - - sHandler1.rewind(); - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float so = sent.getCount(); - l = es.size() - 1; - m = fs.size() - 1; - cross_entropy = log(1.0); - Vector viterbi_alignment(fs.size()); - double viterbi_score = 1; - for(j=1; j <= m; j++){ - Vector *> sPtrCache(es.size(),0); // cache pointers to table - // entries that map fs to all possible ei in this sentence. - PROB denom = 0.0; - PROB e = 0.0, word_best_score = 0; - WordIndex best_i = 0 ; // i for which fj is best maped to ei - for(i=0; i <= l; i++){ - sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ; - if (sPtrCache[i] != 0 &&(*(sPtrCache[i])).prob > PROB_SMOOTH ) - e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ; - else e = PROB_SMOOTH * aTable.getValue(i,j, l, m); - denom += e ; - if (e > word_best_score){ - word_best_score = e ; - best_i = i ; - } - } - viterbi_alignment[j] = best_i ; - viterbi_score *= word_best_score; ///denom ; - cross_entropy += log(denom) ; - if (denom == 0){ - if (test) - cerr << "WARNING: denom is zero (TEST)\n"; - else - cerr << "WARNING: denom is zero (TRAIN)\n"; - } - if (!test){ - if(denom > 0){ - COUNT val = COUNT(so) / (COUNT) double(denom) ; - for( i=0; i <= l; i++){ - PROB e(0.0); - if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH) - e = (*(sPtrCache[i])).prob ; - else e = PROB_SMOOTH ; - e *= aTable.getValue(i,j, l, m); - COUNT temp = COUNT(e) * val ; - if( NoEmptyWord==0 || i!=0 ) - if (sPtrCache[i] != 0) - (*(sPtrCache[i])).count += temp ; - else - tTable.incCount(es[i], fs[j], temp); - aCountTable.getRef(i,j, l, m)+= temp ; - } /* end of for i */ - } // end of if (denom > 0) - }// if (!test) - } // end of for (j) ; - sHandler1.setProbOfSentence(sent,cross_entropy); - perp.addFactor(cross_entropy, so, l, m,1); - viterbi_perp.addFactor(log(viterbi_score), so, l, m,1); - if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) ) - printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score); - addAL(viterbi_alignment,sent.sentenceNo,l); - pair_no++; - } /* of while */ - sHandler1.rewind(); - perp.record("Model2"); - viterbi_perp.record("Model2"); - errorReportAL(cout,"IBM-2"); -} - - - - - diff --git a/ext/giza-pp/GIZA++-v2/model2.h b/ext/giza-pp/GIZA++-v2/model2.h deleted file mode 100644 index ada807e3..00000000 --- a/ext/giza-pp/GIZA++-v2/model2.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
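model2::em_loop above differs from the Model 1 loop in exactly one factor: every lexical weight t(f_j | e_i) is multiplied by the position-alignment probability a(i | j, l, m), and the resulting posteriors feed both the t-counts and the a-counts. The per-position computation, sketched with dense hypothetical arrays indexed [i][j] for one fixed sentence-length pair (l, m) and assuming strictly positive entries:

#include <cmath>
#include <vector>

typedef std::vector<std::vector<double> > Grid;   // indexed [i][j]

// Returns the sentence's cross-entropy; fills post[i][j] with the posterior
// P(a_j = i | e, f) = t[i][j] * a[i][j] / sum_i' t[i'][j] * a[i'][j].
double model2_posteriors(const Grid& t, const Grid& a, Grid& post) {
  const size_t L = t.size(), M = t[0].size();
  double cross_entropy = 0.0;
  post.assign(L, std::vector<double>(M, 0.0));
  for (size_t j = 0; j < M; ++j) {
    double denom = 0.0;
    for (size_t i = 0; i < L; ++i) denom += t[i][j] * a[i][j];
    for (size_t i = 0; i < L; ++i) post[i][j] = t[i][j] * a[i][j] / denom;
    cross_entropy += std::log(denom);
  }
  return cross_entropy;   // post feeds both the tTable counts and aCountTable
}

Setting a[i][j] = 1/(l+1) everywhere recovers Model 1, which is why model2 can inherit model1's machinery wholesale.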
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#ifndef _model2_h -#define _model2_h 1 - -#include - -#include -#include -#include -#include -#include -#include "Vector.h" -#include - -#include -#include -#include - -#include "TTables.h" -#include "ATables.h" -#include "getSentence.h" -#include "defs.h" -#include "model1.h" -#include "Perplexity.h" -#include "vocab.h" - -class model2 : public model1 -{ - public: - amodel&aTable; - amodel&aCountTable; - public: - model2(model1& m1,amodel&,amodel&); - void initialize_table_uniformly(sentenceHandler&); - int em_with_tricks(int); - void load_table(const char* aname); - inline amodel& getATable(void) {return aTable;}; - inline amodel& getACountTable(void) {return aCountTable;}; - void em_loop(Perplexity& perp,sentenceHandler& sHandler1, bool dump_files,const char* alignfile, Perplexity&, bool test); - friend class model3; -}; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/model2to3.cpp b/ext/giza-pp/GIZA++-v2/model2to3.cpp deleted file mode 100644 index 22cbf502..00000000 --- a/ext/giza-pp/GIZA++-v2/model2to3.cpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "model3.h" -#include "utility.h" -#include "Globals.h" - -#define _MAX_FERTILITY 10 - -double get_sum_of_partitions(int n, int source_pos, double alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED]) -{ - int done, init ; - double sum = 0, prod ; - int s, w, u, v; - WordIndex k, k1, i ; - WordIndex num_parts = 0 ; - int total_partitions_considered = 0; - - int part[_MAX_FERTILITY], mult[_MAX_FERTILITY]; - - done = false ; - init = true ; - for (i = 0 ; i < _MAX_FERTILITY ; i++){ - part[i] = mult[i] = 0 ; - } - - //printf("Entering get sum of partitions\n"); - while(! 
done){ - total_partitions_considered++; - if (init){ - part[1] = n ; - mult[1] = 1 ; - num_parts = 1 ; - init = false ; - } - else { - if ((part[num_parts] > 1) || (num_parts > 1)){ - if (part[num_parts] == 1){ - s = part[num_parts-1] + mult[num_parts]; - k = num_parts - 1; - } - else { - s = part[num_parts]; - k = num_parts ; - } - w = part[k] - 1 ; - u = s / w ; - v = s % w ; - mult[k] -= 1 ; - if (mult[k] == 0) - k1 = k ; - else k1 = k + 1 ; - mult[k1] = u ; - part[k1] = w ; - if (v == 0){ - num_parts = k1 ; - } - else { - mult[k1+1] = 1 ; - part[k1+1] = v ; - num_parts = k1 + 1; - } - } /* of if num_parts > 1 || part[num_parts] > 1 */ - else { - done = true ; - } - } - /* of else of if(init) */ - if (!done){ - prod = 1.0 ; - if (n != 0) - for (i = 1 ; i <= num_parts ; i++){ - prod *= pow(alpha[part[i]][source_pos], mult[i]) / factorial(mult[i]) ; - } - sum += prod ; - } - } /* of while */ - if (sum < 0) sum = 0 ; - return(sum) ; -} - -void model3::estimate_t_a_d(sentenceHandler& sHandler1, Perplexity& perp, Perplexity& trainVPerp, - bool simple, bool dump_files,bool updateT) -{ - string tfile, nfile, dfile, p0file, afile, alignfile; - WordIndex i, j, l, m, max_fertility_here, k ; - PROB val, temp_mult[MAX_SENTENCE_LENGTH_ALLOWED][MAX_SENTENCE_LENGTH_ALLOWED]; - double cross_entropy; - double beta, sum, - alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED]; - double total, temp, r ; - - dCountTable.clear(); - aCountTable.clear(); - initAL(); - nCountTable.clear() ; - if (simple) - nTable.clear(); - perp.clear() ; - trainVPerp.clear() ; - ofstream of2; - if (dump_files){ - alignfile = Prefix +".A2to3"; - of2.open(alignfile.c_str()); - } - if (simple) cerr <<"Using simple estimation for fertilties\n"; - sHandler1.rewind() ; - sentPair sent ; - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float count = sent.getCount(); - Vector viterbi_alignment(fs.size()); - l = es.size() - 1; - m = fs.size() - 1; - cross_entropy = log(1.0); - double viterbi_score = 1 ; - PROB word_best_score ; // score for the best mapping of fj - for(j = 1 ; j <= m ; j++){ - word_best_score = 0 ; // score for the best mapping of fj - Vector *> sPtrCache(es.size(),0); - total = 0 ; - WordIndex best_i = 0 ; - for(i = 0; i <= l ; i++){ - sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ; - if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH) // if valid pointer - temp_mult[i][j]= (*(sPtrCache[i])).prob * aTable.getValue(i, j, l, m) ; - else - temp_mult[i][j] = PROB_SMOOTH * aTable.getValue(i, j, l, m) ; - total += temp_mult[i][j] ; - if (temp_mult[i][j] > word_best_score){ - word_best_score = temp_mult[i][j] ; - best_i = i ; - } - } // end of for (i) - viterbi_alignment[j] = best_i ; - viterbi_score *= word_best_score ; /// total ; - cross_entropy += log(total) ; - if (total == 0){ - cerr << "WARNING: total is zero (TRAIN)\n"; - viterbi_score = 0 ; - } - if (total > 0){ - for(i = 0; i <= l ; i++){ - temp_mult[i][j] /= total ; - if (temp_mult[i][j] == 1) // smooth to prevent underflow - temp_mult[i][j] = 0.99 ; - else if (temp_mult[i][j] == 0) - temp_mult[i][j] = PROB_SMOOTH ; - val = temp_mult[i][j] * PROB(count) ; - if ( val > PROB_SMOOTH) { - if( updateT ) - { - if (sPtrCache[i] != 0) - (*(sPtrCache[i])).count += val ; - else - tTable.incCount(es[i], fs[j], val); - } - aCountTable.getRef(i, j, l, m)+=val; - if (0 != i) - dCountTable.getRef(j, i, l, m)+=val; - } - } // for (i = ..) - } // for (if total ...) - } // end of for (j ...) 
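get_sum_of_partitions above enumerates every integer partition of n and sums, over a partition using part k_r exactly m_r times, the product of alpha[k_r]^{m_r} / m_r!. Together with the alpha series built from the Model 2 posteriors a few lines below, this yields the expected fertility counts of the Model 2 to Model 3 transfer. The same quantity written recursively, under the assumption that alpha is 1-based with size at least max_part + 1 (sum_partitions is a hypothetical name; the deleted code walks the partitions iteratively in place instead):

#include <algorithm>
#include <cmath>
#include <vector>

double factorial(int n) { return n <= 1 ? 1.0 : n * factorial(n - 1); }

// Sum over all partitions of n with parts <= max_part of
//   prod_r alpha[k_r]^{m_r} / m_r!
double sum_partitions(int n, int max_part, const std::vector<double>& alpha) {
  if (n == 0) return 1.0;                              // the empty partition
  double sum = 0.0;
  for (int k = std::min(n, max_part); k >= 1; --k)     // largest part k ...
    for (int m = 1; k * m <= n; ++m)                   // ... used m times
      sum += std::pow(alpha[k], m) / factorial(m)
             * sum_partitions(n - k * m, k - 1, alpha);
  return sum;
}

For n = 2 this gives alpha[2] + alpha[1]^2 / 2, matching the two partitions {2} and {1,1}.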
- if (dump_files) - printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score); - addAL(viterbi_alignment,sent.sentenceNo,l); - if (!simple){ - max_fertility_here = min(WordIndex(m+1), MAX_FERTILITY); - for (i = 1; i <= l ; i++) { - for ( k = 1; k < max_fertility_here; k++){ - beta = 0 ; - alpha[k][i] = 0 ; - for (j = 1 ; j <= m ; j++){ - temp = temp_mult[i][j]; - if (temp > 0.95) temp = 0.95; // smooth to prevent under/over flow - else if (temp < 0.05) temp = 0.05; - beta += pow(temp/(1.0-temp), (double) k); - } - alpha[k][i] = beta * pow((double) -1, (double) (k+1)) / (double) k ; - } - } - for (i = 1; i <= l ; i++){ - r = 1; - for (j = 1 ; j <= m ; j++) - r *= (1 - temp_mult[i][j]); - for (k = 0 ; k < max_fertility_here ; k++){ - sum = get_sum_of_partitions(k, i, alpha); - temp = r * sum * count; - nCountTable.getRef(es[i], k)+=temp; - } // end of for (k ..) - } // end of for (i == ..) - } // end of if (!simple) - perp.addFactor(cross_entropy, count, l, m,1); - trainVPerp.addFactor(log(viterbi_score), count, l, m,1); - } // end of while - sHandler1.rewind(); - cerr << "Normalizing t, a, d, n count tables now ... " ; - if( dump_files && OutputInAachenFormat==1 ) - { - tfile = Prefix + ".t2to3" ; - tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1); - } - if( updateT ) - tTable.normalizeTable(Elist, Flist); - aCountTable.normalize(aTable); - dCountTable.normalize(dTable); - if (!simple) - nCountTable.normalize(nTable,&Elist.getVocabList()); - else { - for (i = 0 ; i< Elist.uniqTokens() ; i++){ - if (0 < MAX_FERTILITY){ - nTable.getRef(i,0)=PROB(0.2); - if (1 < MAX_FERTILITY){ - nTable.getRef(i,1)=PROB(0.65); - if (2 < MAX_FERTILITY){ - nTable.getRef(i,2)=PROB(0.1); - if (3 < MAX_FERTILITY) - nTable.getRef(i,3)=PROB(0.04); - PROB val = 0.01/(MAX_FERTILITY-4); - for (k = 4 ; k < MAX_FERTILITY ; k++) - nTable.getRef(i, k)=val; - } - } - } - } - } // end of else (!simple) - p0 = 0.95; - p1 = 0.05; - if (dump_files){ - tfile = Prefix + ".t2to3" ; - afile = Prefix + ".a2to3" ; - nfile = Prefix + ".n2to3" ; - dfile = Prefix + ".d2to3" ; - p0file = Prefix + ".p0_2to3" ; - - if( OutputInAachenFormat==0 ) - tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat); - aTable.printTable(afile.c_str()); - dTable.printTable(dfile.c_str()); - nCountTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),OutputInAachenFormat); - ofstream of(p0file.c_str()); - of << p0; - of.close(); - } - errorReportAL(cerr,"IBM-2"); - if(simple) - { - perp.record("T2To3"); - trainVPerp.record("T2To3"); - } - else - { - perp.record("ST2To3"); - trainVPerp.record("ST2To3"); - } -} - -void model3::transferSimple(/*model1& m1, model2& m2, */ sentenceHandler& sHandler1, - bool dump_files, Perplexity& perp, Perplexity& trainVPerp,bool updateT) -{ - /* - This function performs simple Model 2 -> Model 3 transfer. - It sets values for n and p without considering Model 2's ideas. - It sets d values based on a. - */ - time_t st, fn; - // just inherit these from the previous models, to avoid data duplication - - st = time(NULL); - cerr << "==========================================================\n"; - cerr << "\nTransfer started at: "<< ctime(&st) << '\n'; - - cerr << "Simple tranfer of Model2 --> Model3 (i.e. 
estimating initial parameters of Model3 from Model2 tables)\n"; - - estimate_t_a_d(sHandler1, perp, trainVPerp, true, dump_files,updateT) ; - fn = time(NULL) ; - cerr << "\nTransfer: TRAIN CROSS-ENTROPY " << perp.cross_entropy() - << " PERPLEXITY " << perp.perplexity() << '\n'; - cerr << "\nTransfer took: " << difftime(fn, st) << " seconds\n"; - cerr << "\nTransfer Finished at: "<< ctime(&fn) << '\n'; - cerr << "==========================================================\n"; - -} - - -void model3::transfer(sentenceHandler& sHandler1,bool dump_files, Perplexity& perp, Perplexity& trainVPerp,bool updateT) -{ - if (Transfer == TRANSFER_SIMPLE) - transferSimple(sHandler1,dump_files,perp, trainVPerp,updateT); - { - time_t st, fn ; - - st = time(NULL); - cerr << "==========================================================\n"; - cerr << "\nTransfer started at: "<< ctime(&st) << '\n'; - cerr << "Transfering Model2 --> Model3 (i.e. estimating initial parameters of Model3 from Model2 tables)\n"; - - p1_count = p0_count = 0 ; - - estimate_t_a_d(sHandler1, perp, trainVPerp, false, dump_files,updateT); - - - - /* Below is a made-up stab at transferring t & a probs to p0/p1. - (Method not documented in IBM paper). - It seems to give p0 = .96, which may be right for Model 2, or may not. - I'm commenting it out for now and hardwiring p0 = .90 as above. -Kevin - - // compute p0, p1 counts - Vector nm(Elist.uniqTokens(),0.0); - - for(i=0; i < Elist.uniqTokens(); i++){ - for(k=1; k < MAX_FERTILITY; k++){ - nm[i] += nTable.getValue(i, k) * (LogProb) k; - } - } - - LogProb mprime; - // sentenceHandler sHandler1(efFilename.c_str()); - // sentPair sent ; - - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float count = sent.noOccurrences; - - l = es.size() - 1; - m = fs.size() - 1; - mprime = 0 ; - for (i = 1; i <= l ; i++){ - mprime += nm[es[i]] ; - } - mprime = LogProb((int((double) mprime + 0.5))); // round mprime to nearest integer - if ((mprime < m) && (2 * mprime >= m)) { - // cerr << "updating both p0_count and p1_count, mprime: " << mprime << - // "m = " << m << "\n"; - p1_count += (m - (double) mprime) * count ; - p0_count += (2 * (double) mprime - m) * count ; - // cerr << "p0_count = "<1}&16:l, &32:m, &64:F, &128:E",PARLEV_MODELS,76); -GLOBAL_PARAMETER(int,M5_Dependencies,"depm5","d_{=1}: &1:l, &2:m, &4:F, &8:E, d_{>1}&16:l, &32:m, &64:F, &128:E",PARLEV_MODELS,68); -GLOBAL_PARAMETER4(int,Model3_Dump_Freq,"MODEL 345 DUMP FREQUENCY","MODEL 3 DUMP FREQUENCY","t3","t345","dump frequency of Model 3/4/5",PARLEV_OUTPUT,0); - - -extern int Transfer_Dump_Freq; - -model3::model3(model2& m2) : - model2(m2),dTable(true), dCountTable(true), - nTable(m2.getNoEnglishWords()+1, MAX_FERTILITY), - nCountTable(m2.getNoEnglishWords()+1, MAX_FERTILITY),h(0) -{} - -void model3::load_tables(const char *nfile, const char *dfile, const char *p0file){ - cout << "Model3: loading n, d, p0 tables \n"; - - nTable.readNTable(nfile); - dTable.readTable(dfile); - ifstream inf(p0file); - if( !inf ) - cerr << "Can not open: " << p0file << '\n'; - else - { - cout << "Reading p0 value from " << p0file << "\n"; - inf >> p0; - inf.close(); - p1 = 1 - p0; - } - cout << "p0 is: " << p0 << " p1:" << p1 << '\n'; -} - -model3::~model3() -{ - dTable.clear(); - dCountTable.clear(); - nTable.clear(); - nCountTable.clear(); -} - - -void model3::em(int noIterations, sentenceHandler& sHandler1) -{ - - LogProb all_prob, aprob, temp ; - WordIndex i, j, l, m ; - time_t it_st, st, it_fn, fn ; - string 
tfile, dfile, nfile, p0file, afile, number; - - st = time(NULL) ; - if (Log) - logmsg << "\n" << "Starting Model3: Training"; - cout << "\n" << "Starting Model3: Training"; - // sentenceHandler sHandler1(efFilename.c_str()); - sHandler1.rewind(); - for(int it=1; it <= noIterations; it++){ - it_st = time(NULL) ; - if (Log) - logmsg << "\n" << "Model3: Iteration " << it; - cout << "\n" << "Model3: Iteration " << it; - - // set up the names of the files where the tables will be printed - int n = it; - number = ""; - do{ - //mj changed next line - number.insert((size_t) 0, 1, (char)(n % 10 + '0')); - } while((n /= 10) > 0); - tfile = Prefix + ".t3." + number ; - afile = Prefix + ".a3." + number ; - nfile = Prefix + ".n3." + number ; - dfile = Prefix + ".d3." + number ; - p0file = Prefix + ".p0_3." + number ; - // tCountTable.clear(); - dCountTable.clear(); - nCountTable.clear(); - p0_count = p1_count = 0 ; - all_prob = 0 ; - sentPair sent ; - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float count = sent.getCount(); - if ((sent.sentenceNo % 1000) == 0) - cout < A(fs.size(),/*-1*/0); - Vector Fert(es.size(),0); - LogProb lcount=(LogProb)count; - l = es.size()-1; - m = fs.size()-1; - WordIndex x, y ; - all_prob = prob_of_target_given_source(tTable, fs, es); - if (all_prob == 0) - cout << "\n" <<"all_prob = 0"; - - for ( x = 0 ; x < pow(l+1.0, double(m)) ; x++){ // For all possible alignmets A - y = x ; - for (j = 1 ; j <= m ; j++){ - A[j] = y % (l+1) ; - y /= (l+1) ; - } - for(i = 0 ; i <= l ; i++) - Fert[i] = 0 ; - for (j = 1 ; j <= m ; j++) - Fert[A[j]]++; - if (2 * Fert[0] <= m){ /* consider alignments that has Fert[0] less than - half the number of words in French sentence */ - aprob = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); - temp = aprob/all_prob ; - LogProb templcount = temp*lcount; - - for (j = 1 ; j <= m ; j++){ - tTable.incCount(es[A[j]], fs[j], templcount); - if (0 != A[j]) - dCountTable.getRef(j, A[j], l, m)+=templcount; - } - for(i = 0 ; i <= l ; i++) - { - nCountTable.getRef(es[i], Fert[i])+=templcount; - //cout << "AFTER INC2: " << templcount << " " << nCountTable.getRef(es[i], Fert[i]) << '\n'; - } - p1_count += double(temp) * (Fert[0] * count) ; - p0_count += double(temp) * ((m - 2 * Fert[0]) * count) ; - } - } /* of looping over all alignments */ - } /* of sentence pair E, F */ - sHandler1.rewind(); - - // normalize tables - if( OutputInAachenFormat==1 ) - tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1); - tTable.normalizeTable(Elist, Flist); - aCountTable.normalize(aTable); - dCountTable.normalize(dTable); - nCountTable.normalize(nTable,&Elist.getVocabList()); - - // normalize p1 & p0 - - if (p1_count + p0_count != 0){ - p1 = p1_count / ( p1_count + p0_count ) ; - p0 = 1 - p1 ; - } - else { - p1 = p0 = 0 ; - } - // print tables - if( OutputInAachenFormat==0 ) - tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat); - dTable.printTable(dfile.c_str()); - nTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),OutputInAachenFormat); - ofstream of(p0file.c_str()); - of << p0; - of.close(); - it_fn = time(NULL) ; - cout << "\n" << "Model3 Iteration "< vac(m+1,0); - for(PositionIndex i=1;i<=l;i++) - { - PositionIndex cur_j=al.als_i[i]; - cout << "LOOP: " << i << " " << cur_j << '\n'; - PositionIndex prev_j=0; - PositionIndex k=0; - if(cur_j) { // process first word of cept - k++; - vac_all--; - 
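// [Editorial aside] The exhaustive E-step in model3::em above enumerates all
// (l+1)^m alignments by treating x as an m-digit base-(l+1) number, with digit
// j-1 naming the source position linked to target word j. Isolated as a
// standalone sketch (hypothetical helper, not GIZA++ code):
#include <vector>
std::vector<int> decode_alignment(unsigned long x, int l, int m) {
  std::vector<int> A(m + 1, 0);            // A[1..m]; position 0 is the NULL word
  for (int j = 1; j <= m; j++) { A[j] = int(x % (l + 1)); x /= (l + 1); }
  return A;
}
// e.g. for l=1, m=2 the values x=0..3 decode to (0,0), (1,0), (0,1), (1,1).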
assert(vac[cur_j]==0); - vac[cur_j]=1; - for(unsigned int q=0;q1(" << vacancies(vac,cur_j) << "-" << vprev << "|" << vac_all<< "+" << -al.fert(i)<< "+" << +k << ")\n" << '\n'; - prev_j=cur_j; - cur_j=al.als_j[cur_j].next; - } - assert(k==al.fert(i)); - if( k ) - prev_cept=i; - } - assert(vac_all==al.fert(0)); -} -*/ - -extern short DoViterbiTraining; - -int model3::viterbi(int noIterationsModel3, int noIterationsModel4,int noIterationsModel5,int noIterationsModel6) -{ - double minErrors=1.0;int minIter=0; - d4model d4m(MAX_SENTENCE_LENGTH); - d4m.makeWordClasses(Elist,Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); - d5model d5m(d4m); - d5m.makeWordClasses(Elist,Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); - time_t it_st, st, it_fn, fn; - bool dump_files = false ; - string tfile, tfile_actual, dfile, afile, nfile, nfile_actual, p0file, alignfile, number, test_alignfile, d4file,d5file,zeroFertFile; - st = time(NULL); - sHandler1.rewind(); - if (testPerp && testHandler) - (*testHandler).rewind(); - string trainingString; - trainingString+=(h?'H':'3'); - for(int i=0;i 0); - if( final ) - number="final"; - tfile = Prefix + ".t3." + number ; - tfile_actual = Prefix + ".actual.t3." + number ; - afile = Prefix + ".a3." + number ; - nfile = Prefix + ".n3." + number ; - nfile_actual = Prefix + ".actual.n3." + number ; - dfile = Prefix + ".d3." + number ; - d4file = Prefix + ".d4." + number ; - d4file2 = Prefix + ".D4." + number ; - d5file = Prefix + ".d5." + number ; - alignfile = Prefix + ".A3." + number ; - test_alignfile = Prefix + ".tst.A3." + number ; - p0file = Prefix + ".p0_3." + number ; - } - // clear count tables - // tCountTable.clear(); - dCountTable.clear(); - aCountTable.clear(); - initAL(); - nCountTable.clear(); - d4m.clear(); - p0_count = p1_count = 0 ; - -#ifdef TRICKY_IBM3_TRAINING - -#define TRAIN_ARGS perp, trainViterbiPerp, sHandler1, dump_files, alignfile.c_str(), true, modelName,final -#define TEST_ARGS *testPerp, *testViterbiPerp, *testHandler, dump_files, test_alignfile.c_str(),false, modelName,final - - - switch( toModel ) - { - case '3': - switch(fromModel ) - { - case 'H': - viterbi_loop_with_tricks (TRAIN_ARGS,h,(void*)0); - if (testPerp && testHandler) - viterbi_loop_with_tricks(TEST_ARGS, h,(void*)0); - break; - case '3': - viterbi_loop_with_tricks( TRAIN_ARGS, (void*)0,(void*)0); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS, (void*)0,(void*)0); - break; - default: abort(); - } - break; - case '4': - { - switch(fromModel) - { - case 'H': - viterbi_loop_with_tricks (TRAIN_ARGS,h,&d4m); - if (testPerp && testHandler) - viterbi_loop_with_tricks(TEST_ARGS, h,&d4m); - break; - case '3': - viterbi_loop_with_tricks(TRAIN_ARGS, (void*)0,&d4m); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS , (void*)0,&d4m); - break; - case '4': - viterbi_loop_with_tricks(TRAIN_ARGS , &d4m,&d4m); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS, &d4m,&d4m); - break; - default: abort(); - } - d4m.normalizeTable(); - if( dump_files ) - d4m.printProbTable(d4file.c_str(),d4file2.c_str()); - } - break; - case '5': - { - switch(fromModel) - { - case 'H': - viterbi_loop_with_tricks (TRAIN_ARGS,h,&d5m); - if (testPerp && testHandler) - viterbi_loop_with_tricks(TEST_ARGS, h,&d5m); - break; - case '3': - viterbi_loop_with_tricks(TRAIN_ARGS, (void*)0,&d5m); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS , (void*)0,&d5m); - break; - case '4': - 
viterbi_loop_with_tricks(TRAIN_ARGS, &d4m,&d5m); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS, &d4m,&d5m); - break; - case '5': - viterbi_loop_with_tricks(TRAIN_ARGS, &d5m,&d5m); - if (testPerp && testHandler) - viterbi_loop_with_tricks( TEST_ARGS, &d5m,&d5m); - break; - default: abort(); - } - d5m.d4m.normalizeTable(); - if( dump_files ) - d5m.d4m.printProbTable(d4file.c_str(),d4file2.c_str()); - d5m.normalizeTable(); - if( dump_files ) - { - ofstream d5output(d5file.c_str()); - d5output << d5m; - } - } - break; - default: abort(); - } - -#else - viterbi_loop(perp, trainViterbiPerp, sHandler1, dump_files, - alignfile.c_str(), true, model); - if (testPerp && testHandler) - viterbi_loop(*testPerp, *testViterbiPerp, *testHandler, - dump_files, test_alignfile.c_str(), false, model); - -#endif - if( errorsAL() -#include -#include -#include -#include -#include -#include "Vector.h" -#include - - -#include -#include -#include -#include "MoveSwapMatrix.h" -#include "TTables.h" -#include "ATables.h" -#include "NTables.h" -#include "getSentence.h" -#include "defs.h" -#include "model2.h" -#include "Perplexity.h" -#include "transpair_model3.h" -#include "transpair_modelhmm.h" -#include "alignment.h" -#include "vocab.h" -#include "D4Tables.h" -#include "AlignTables.h" - -class model3 : public model2 -{ -public: - amodel dTable; - amodel dCountTable; - - PROB p0,p1; - double p0_count, p1_count ; - - nmodel nTable; - nmodel nCountTable; - hmm*h; - -public: - void setHMM(hmm*_h){h=_h;} - model3(model2& m2); - ~model3(); - // methods - void transfer(sentenceHandler&, bool, Perplexity&, Perplexity&,bool updateT=1); - void transferSimple(sentenceHandler&, bool, Perplexity&, Perplexity&,bool updateT=1); - void load_tables(const char *nfile, const char *dfile, const char *p0file); - - void em(int, sentenceHandler&); - int viterbi(int, int, int,int); - -private: - LogProb prob_of_special(Vector&, - Vector&, - tmodel&, - Vector&, - Vector&); - - LogProb prob_of_target_and_alignment_given_source(Vector&, - Vector&, - tmodel&, - Vector&, - Vector&); - LogProb prob_of_target_given_source(tmodel&, - Vector&, - Vector&); - - LogProb scoreOfMove(Vector&, Vector&, - Vector&, Vector&, - tmodel&, WordIndex, WordIndex); - - LogProb scoreOfSwap(Vector&, Vector&, - Vector&, tmodel&, int, int); - - void hillClimb(Vector&, Vector&, - Vector&, Vector&, - LogProb&, tmodel&, int, int); - - void findBestAlignment(Vector&, Vector&, - Vector&, Vector&, - LogProb&,int , int); - - - void findAlignmentsNeighborhood( Vector&, - Vector&, - LogProb&align_total_count, - alignmodel&neighborhood, - int, int); - void collectCountsOverAlignement(const Vector& es, - const Vector& fs, - const Vector&, - LogProb , float count); - LogProb viterbi_model2(const transpair_model3&ef, alignment&output, int pair_no,int i_peg = -1 , int j_peg = -1 )const; - LogProb _viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg = -1 , int j_peg = -1 )const; - LogProb viterbi_model2(const transpair_modelhmm&ef, alignment&output, int pair_no,int i_peg = -1 , int j_peg = -1 )const; - - private: - void estimate_t_a_d(sentenceHandler& sHandler1, Perplexity& perp1, Perplexity& perp2,bool simple, bool dump_files,bool updateT); - void viterbi_loop(Perplexity&, Perplexity&, sentenceHandler&, bool, const char*,bool,string model); - - template - void viterbi_loop_with_tricks(Perplexity&, Perplexity&, sentenceHandler&, - bool, const char*, bool, string model, bool final,A*d4m,B*d5m); - -}; - -#endif diff --git 
a/ext/giza-pp/GIZA++-v2/model345-peg.cpp b/ext/giza-pp/GIZA++-v2/model345-peg.cpp deleted file mode 100644 index 8c1bde6c..00000000 --- a/ext/giza-pp/GIZA++-v2/model345-peg.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "model3.h" -#include "collCounts.h" - -bool makeOneMoveSwap(const alignment&x,const alignment&y,set&soms) -{ - OneMoveSwap oms; - oms.type=0; - int count=0; - Vector positions(4); - assert(x.get_m()==y.get_m()); - for(PositionIndex j=1;j<=x.get_m();j++) - if(x(j)!=y(j)) - { - if(count==4) - return 0; - positions[count]=j; - count++; - } - assert(count>0); - if(count==1) - { - oms.type=1; - oms.a=positions[0]; - oms.b=y(positions[0]); - soms.insert(oms); - for(unsigned int j=1;j<=x.get_m();++j) - { - if( int(j)!=positions[0]&&y(j)==y(positions[0])) - { - oms.type=3; - oms.a=j; - oms.b=x(positions[0]); - soms.insert(oms); - } - } - for(unsigned int j=1;j<=x.get_m();++j) - { - if( int(j)!=positions[0]&&x(j)==x(positions[0])) - { - oms.type=2; - oms.a=positions[0]; - oms.b=j; - if( oms.b xx(3),yy(3); - xx[0]=x(positions[0]);xx[1]=x(positions[1]);xx[2]=x(positions[2]); - yy[0]=y(positions[0]);yy[1]=y(positions[1]);yy[2]=y(positions[2]); - sort(xx.begin(),xx.end()); - sort(yy.begin(),yy.end()); - if(xx==yy) - { - oms.type=2;oms.a=positions[0];oms.b=positions[1];soms.insert(oms); - oms.type=2;oms.a=positions[0];oms.b=positions[2];soms.insert(oms); - oms.type=2;oms.a=positions[1];oms.b=positions[2];soms.insert(oms); - } - else - { - //cout << "HERE.\n"; - if( x(positions[0])==y(positions[1])&&x(positions[1])==y(positions[0]) ) - { - oms.type=2;oms.a=positions[0];oms.b=positions[1]; - if( oms.b xx(4),yy(4); - for(int i=0;i<4;++i) - { - xx[i]=x(positions[i]); - yy[i]=y(positions[i]); - } - sort(xx.begin(),xx.end()); - sort(yy.begin(),yy.end()); - if(xx==yy) - { - oms.type=2; - for(int j1=0;j1<4;j1++) - for(int j2=j1+1;j2<4;j2++) - { - if(x(positions[j1])!=x(positions[j2])&& - x(positions[j1])==y(positions[j2])&& - x(positions[j2])==y(positions[j1])) - { - oms.type=2;oms.a=positions[j1];oms.b=positions[j2]; - soms.insert(oms); - } - } - } - return 1; - } - else - return 0; -} diff --git a/ext/giza-pp/GIZA++-v2/model3_viterbi.cpp b/ext/giza-pp/GIZA++-v2/model3_viterbi.cpp deleted file mode 100644 index bf1e7ab6..00000000 --- a/ext/giza-pp/GIZA++-v2/model3_viterbi.cpp +++ /dev/null @@ -1,656 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "model3.h" -#include "utility.h" -#include "Globals.h" - - -LogProb model3::prob_of_target_and_alignment_given_source(Vector& A, - Vector& Fert, - tmodel& tTable, - Vector& fs, - Vector& es) -{ - LogProb total = 1.0 ; - LogProb temp = 0.0 ; - const LogProb zero = 0.0 ; - WordIndex l = es.size()-1, m = fs.size()-1; - WordIndex i, j ; - - total *= pow(double(1-p1), m-2.0 * Fert[0]) * pow(double(p1), double(Fert[0])); - if (total == 0) - return(zero); - for (i = 1 ; i <= Fert[0] ; i++){ // loop caculates m-fert[0] choose fert[0] - total *= double(m - Fert[0] - i + 1) / i ; - if (total == 0) - return(zero); - } - for (i = 1 ; i <= l ; i++){ // this loop calculates fertilities term - total *= double(nTable.getValue(es[i], Fert[i])) * (LogProb) factorial(Fert[i]); - if (total == 0) - return(zero); - } - for (j = 1 ; j <= m ; j++){ - // temp = tTable.getValue(es[A[j]], fs[j]) ; - temp = double(tTable.getProb(es[A[j]], fs[j])) ; - total *= temp ; - if (0 != A[j]) - total *= double(dTable.getValue(j, A[j], l, m)); - if (total == 0) - return(zero); - } - return(total); -} - -LogProb model3::prob_of_target_given_source(tmodel& tTable, - Vector& fs, - Vector& es) -{ - - WordIndex x, y ; - LogProb total = 0 ; - // WordIndex l = es.size(), m = fs.size(); - WordIndex l = es.size()-1, m = fs.size()-1; - Vector A(fs.size(),/*-1*/0); - Vector Fert(es.size(),0); - WordIndex i,j ; - - for ( x = 0 ; x < pow(l+1.0, double(m)) ; x++){ // For all possible alignmets A - y = x ; - // for (j = 1 ; j < m ; j++){ - for (j = 1 ; j <= m ; j++){ - A[j] = y % (l+1) ; - y /= (l+1) ; - } - // for(i = 0 ; i < l ; i++) - for(i = 0 ; i <= l ; i++) - Fert[i] = 0 ; - // for (j = 1 ; j < m ; j++) - for (j = 1 ; j <= m ; j++) - Fert[A[j]]++; - // if (2 * Fert[0] < m){ - if (2 * Fert[0] <= m){ /* consider alignments that has Fert[0] less than - half the length of french sentence */ - total += prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); - } - } - return(total); -} - - -LogProb model3::scoreOfMove(Vector& es, - Vector& fs, - Vector& A, - Vector& Fert, - tmodel& tTable, - WordIndex j, - WordIndex i) - // returns the scaling factor of the original score if A[j] is linked to - // i, no change is really made to A - // but the score is calculated if the move is to be taken (i.e. - // no side effects on Alignment A nor its Fertility Fert - // If the value of the scaling factor is: - // 1: then the score of the new alignment if the move is taken will - // not change. - // 0.5: the new score is half the score of the original alignment. - // 2.0: the new score will be twice as much. 
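// [Editorial aside] A toy check of the scaling-factor idea just described
// (hypothetical sketch with plain arrays standing in for GIZA++'s table
// classes): score one alignment in full, apply the closed-form ratio for a
// non-NULL move, and compare against rescoring from scratch -- they agree.
#include <cmath>
#include <cstdio>

double t[3][3], d[3][3], n[3][4];          // toy t(f_j|e_i), d(j|i,l,m), n(phi|e_i)
const double p0 = 0.9, p1 = 0.1;
const int m = 2;                           // two target words, source positions 0..2

double score(const int A[3]) {             // P(f,a|e), as in the function above
  int phi[3] = {0, 0, 0};
  for (int j = 1; j <= m; j++) phi[A[j]]++;
  double s = pow(p1, double(phi[0])) * pow(p0, double(m - 2 * phi[0]));
  for (int i = 1; i <= phi[0]; i++) s *= double(m - phi[0] - i + 1) / i;  // binomial term
  for (int i = 1; i <= 2; i++) s *= n[i][phi[i]] * tgamma(phi[i] + 1.0);  // n * phi!
  for (int j = 1; j <= m; j++) { s *= t[A[j]][j]; if (A[j]) s *= d[j][A[j]]; }
  return s;
}

int main() {
  for (int i = 0; i < 3; i++)              // arbitrary positive toy values
    for (int j = 0; j < 3; j++) { t[i][j] = 0.10 + 0.05 * (3 * i + j); d[i][j] = 0.20 + 0.02 * (i + 2 * j); }
  for (int i = 0; i < 3; i++)
    for (int f = 0; f < 4; f++) n[i][f] = 0.10 + 0.05 * (i + f);
  const int A[3] = {0, 1, 1};              // both target words aligned to source word 1
  const int B[3] = {0, 1, 2};              // after moving j=2 from i=1 to i=2
  double ratio = (double(0 + 1) / 2)       // (Fert[i]+1) / Fert[A[j]]
               * (n[1][1] / n[1][2]) * (n[2][1] / n[2][0])
               * (t[2][2] / t[1][2]) * (d[2][2] / d[2][1]);
  printf("%g == %g\n", score(B) / score(A), ratio);  // the two numbers agree
  return 0;
}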
- // -{ - // LogProb score; - LogProb change ; - WordIndex m, l ; - - m = fs.size() - 1; - l = es.size() - 1; - - - if (A[j] == i) - // return(original_score); - return(1) ; - else if (A[j] == 0){ // a move from position zero to something else - change = double(p0*p0)/p1 * - (double((Fert[0]*(m-Fert[0]+1))) / ((m-2*Fert[0]+1)*(m-2*Fert[0]+2))) * - (Fert[i]+1) * - double(nTable.getValue(es[i], Fert[i]+1)) / - double(nTable.getValue(es[i], Fert[i])) * - double(tTable.getProb(es[i], fs[j])) / - double(tTable.getProb(es[A[j]], fs[j])) * - double(dTable.getValue(j, i, l, m)); - } - else if (i == 0){ // a move to position zero - change= - ((double(p1) / (p0*p0)) * - (double((m-2*Fert[0])*(m-2*Fert[0]-1))/((Fert[0]+1)*(m-Fert[0]))) * - (double(1)/Fert[A[j]]) * - double(nTable.getValue(es[A[j]], Fert[A[j]]-1)) / - double(nTable.getValue(es[A[j]], Fert[A[j]]))* - double(tTable.getProb(es[i], fs[j])) / - double(tTable.getProb(es[A[j]], fs[j])) * - 1.0 / double(dTable.getValue(j, A[j], l, m))); - } - else{ // a move that does not involve position zero - change = - ((double(Fert[i]+1)/Fert[A[j]]) * - double(nTable.getValue(es[A[j]], Fert[A[j]]-1)) / - double(nTable.getValue(es[A[j]], Fert[A[j]])) * - double(nTable.getValue(es[i], Fert[i]+1)) / - double(nTable.getValue(es[i], Fert[i])) * - double(tTable.getProb(es[i], fs[j]))/ - double(tTable.getProb(es[A[j]], fs[j])) * - double(dTable.getValue(j, i, l, m))/ - double(dTable.getValue(j, A[j], l, m))); - } - return(change); -} - - -LogProb model3::scoreOfSwap(Vector& es, - Vector& fs, - Vector& A, - tmodel& tTable, - int j1, - int j2) - // returns the scaling factor of the original score if the swap to - // take place, - // No side effects here (none of the parameters passed is changed! - // (i.e. the alignment A is not really changed) - // If the value of the scaling factor is: - // 1: then the score of the new alignment if the move is taken will - // not change. - // 0.5: the new score is half the score of the original alignment. - // 2.0: the new score will be twice as much. - // -{ - LogProb score ; - WordIndex i1, i2, m, l ; - - m = fs.size() - 1 ; - l = es.size() - 1 ; - if (j1 == j2 || A[j1] == A[j2]) // if swapping same position return ratio 1 - return(1); - else { - i1 = A[j1] ; - i2 = A[j2] ; - score = - double(tTable.getProb(es[i2], fs[j1]))/double(tTable.getProb(es[i1], fs[j1])) * - double(tTable.getProb(es[i1], fs[j2]))/double(tTable.getProb(es[i2], fs[j2])); - if (i1 != 0){ - score *= double(dTable.getValue(j2, i1, l, m))/double(dTable.getValue(j1, i1, l, m)); - } - if (i2 != 0){ - score *= double(dTable.getValue(j1, i2, l, m))/double(dTable.getValue(j2, i2, l, m)); - } - return(score); - } -} - - - -void model3::hillClimb(Vector& es, - Vector& fs, - Vector& A, - Vector& Fert, - LogProb& best_score, - tmodel& tTable, - int = -1, - int j_peg = -1) - // Hill climbing given alignment A . - // Alignment A will be updated and also best_score - // if no pegging is needed i_peg == -1, and j_peg == -1 -{ - WordIndex i, j, l, m, j1, old_i; - LogProb change ; - bool local_minima; - int level = 0 ; - LogProb best_change_so_far, best_change ; - Vector A_so_far; - Vector Fert_so_far; - - l = es.size() - 1; - m = fs.size() - 1; - if (Log) - logmsg << "\nStarting hill climbing with original score: " << best_score <<"\n"; - best_change = 1 ; // overall scaling factor (i.e. 
from the begining of climb - do { - best_change_so_far = 1 ; // best scaling factor of this level of hill climb - local_minima = true ; - for (j = 1 ; j <= m ; j++){ - if (int(j) != j_peg){ // make sure not to change the pegged link - for (j1 = j + 1 ; j1 <= m; j1++){ - // for all possible swaps - // make sure you are not swapping at same position - if ((A[j] != A[j1]) && (int(j1) != j_peg)){ - // change = scoreOfSwap(es, fs, A, best_score, tTable, j, j1); - change = scoreOfSwap(es, fs, A, tTable, j, j1); - if (change > best_change_so_far){ // if better alignment found, keep it - local_minima = false ; - best_change_so_far = change ; - A_so_far = A ; - Fert_so_far = Fert ; - old_i = A_so_far[j] ; - A_so_far[j] = A_so_far[j1] ; - A_so_far[j1] = old_i ; - } // end of if (change > best_change_so_far) - } // end of if (A[j] != A[j1] ..) - } // of for (j1 = j+1 ....) - // for (i = 0 ; i < l ; i++){ // all possible moves - for (i = 0 ; i <= l ; i++){ // all possible moves - if (i != A[j]){ // make sure not to move to same position - if (i != 0 || (m >= 2 * (Fert[0]+1))){ // if moving to NULL word - // (pos 0), make sure not to violate the fertility restriction - // i.e. NULL can not take more than half the target words - // change = scoreOfMove(es, fs, A, Fert, best_score, tTable, j, i); - change = scoreOfMove(es, fs, A, Fert, tTable, j, i); - if (change > best_change_so_far){ // if better alignment found, keep it - best_change_so_far = change ; - local_minima = false ; - A_so_far = A ; - Fert_so_far = Fert ; - old_i = A_so_far[j] ; - A_so_far[j] = i ; - Fert_so_far[old_i]-- ; - Fert_so_far[i]++ ; - } // end of if (change > best_change_so_far) - } // end of if ((i!=0) ... - } // end of if (i != A[j] ) - } // end of for (i = 0 ; ....) - } // end of if(j != j_peg) - } // end of for (j = 1 ; ...) - level++; - if (!local_minima){ - if (best_change_so_far > 1){ // if current chage is improving - A = A_so_far ; - Fert = Fert_so_far ; - best_change *= best_change_so_far ; - } - else{ - local_minima = true ; - } - } // end of if(!local_minima) - if (Log) - logmsg << "." ; - if (level> 15) - cerr << "." ; - } while (local_minima == false); - if (Log) - logmsg << "\n" << "Hill Climb Level: " << level << " score: scaling old: " <<(best_score*best_change) ; - if (level > 15) - cerr << "\nHill Climb Level: " << level << " score: scaling old: " <<(best_score*best_change) ; - best_score = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); - if (Log) - logmsg << " using new calc: " << best_score << '\n'; - if (level>15) - cerr << " using new calc: " << best_score << '\n'; -} - - -void model3::findBestAlignment(Vector& es, - Vector& fs, - Vector& A, - Vector& Fert, - LogProb& best_score, - /*tmodel& tTable, - amodel& aTable, */ - int i_peg = -1 , - int j_peg = -1 ) - // This finds the best Model2 alignment (i.e. no fertilities stuff) in A - // for the given sentence pair. Its score is returned in A. Its fertility - // info in Fert. - // if j_peg == -1 && i_peg == -1 then No pegging is performed. 
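// [Editorial aside] The core of the search just described, as a minimal
// standalone sketch (hypothetical; the real code below also enforces
// MAX_FERTILITY, the NULL-word cap, and optional pegging): for each target
// position j independently, pick the source position maximizing
// t(f_j|e_i) * a(i|j,l,m).
#include <vector>

// ta[i][j] holds the precomputed product t(f_j|e_i) * a(i|j,l,m)
std::vector<int> model2_greedy(const std::vector<std::vector<double> >& ta,
                               int l, int m) {
  std::vector<int> A(m + 1, 0);            // A[1..m]; 0 is the NULL word
  for (int j = 1; j <= m; j++) {
    double best = -1.0;
    for (int i = 0; i <= l; i++)
      if (ta[i][j] > best) { best = ta[i][j]; A[j] = i; }
  }
  return A;
}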
-{ - WordIndex i, j, l, m, best_i=0; - LogProb temp, score, ss; - - l = es.size() - 1; - m = fs.size() - 1; - for (i=0 ; i <= l ; i++) - Fert[i] = 0 ; - ss = 1 ; - if ((j_peg != -1) && (i_peg != -1)){ // if you're doing pegging - A[j_peg] = i_peg ; - Fert[i_peg] = 1 ; - ss *= double(tTable.getProb(es[i_peg], fs[j_peg])) * - double(aTable.getValue(i_peg, j_peg, l, m)); - } - for (j = 1 ; j <= m ; j++){ - if (int(j) != j_peg){ - score = 0 ; - for (i = 0 ; i <= l ; i++){ - // first make sure that connecting target word at pos j to source word - // at pos i will not lead to a violation on Fertility restrictions - // (e.g. maximum fertility for a word, max fertility for NULL word, etc) - if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1))) - || (i != 0))){ - temp = double(tTable.getProb(es[i], fs[j])) * - double(aTable.getValue(i, j, l, m)); - if (temp > score ){ - best_i = i ; - score = temp ; - } // end of if (temp > score) - } // end of if (((i == 0 ...) - } // end of for (i= 0 ...) - if (score == 0){ - cerr << "WARNING: In searching for model2 best alignment\n " ; - cerr << "Nothing was set for target token " << fs[j] << - "at position j: " << j << "\n"; - for (i = 0 ; i <= l ; i++){ - cerr << "i: " << i << "ttable("<= 2*(Fert[0]+1))) - || (i != 0))) - cerr <<"Passed fertility condition \n"; - else - cerr <<"Failed fertility condition \n"; - } - - } // end of if (score == 0) - else { - Fert[best_i]++ ; - A[j] = best_i ; - } - ss *= score ; - } // end of if (j != j_peg) - } // end of for (j == 1 ; ...) - if (ss <= 0){ - cerr << "WARNING: Model2 viterbi alignment has zero score for sentence pair:\n" ; - printSentencePair(es, fs, cerr); - } - best_score = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); - if (Log) - logmsg << "finding best alignment : score : " << ss <<"p(f, a/e) = "<< best_score<<"\n"; -} - -void model3::collectCountsOverAlignement(const Vector& es, - const Vector& fs, - const Vector& A, - LogProb score, - float count) -{ - WordIndex j,i,l,m ; - Vector Fert(es.size(),0); - l = es.size() - 1 ; - m = fs.size() - 1 ; - score *= LogProb(count); - COUNT temp = COUNT(score) ; - for (i=0 ; i <= l ; i++) - Fert[i] = 0 ; - for (j = 1 ; j <= m ; j++){ - Fert[A[j]]++; - tTable.incCount(es[A[j]], fs[j], temp); - // tCountTable.getRef(es[A[j]], fs[j])+=score; - if (A[j]) - dCountTable.getRef(j, A[j], l, m)+= temp ; - aCountTable.getRef(A[j], j, l, m)+= temp ; - } - for(i = 0 ; i <= l ; i++) - nCountTable.getRef(es[i], Fert[i])+= temp ; - // p1_count += score * (LogProb) (Fert[0]) ; - // p0_count += score * (LogProb) ((m - 2 * Fert[0])) ; - p1_count += temp * (Fert[0]) ; - p0_count += temp * ((m - 2 * Fert[0])) ; -} - - - -void model3::findAlignmentsNeighborhood(Vector& es, - Vector& fs, - LogProb&align_total_count, - alignmodel&neighborhood, - int i_peg = -1, - int j_peg = -1 - ) - // Finding the Neigborhood of a best viterbi alignment after hill climbing - // if (i_peg == -1 and j_peg == -1, then No Pegging is done. 
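// [Editorial aside] Size of the neighborhood built below, before fertility and
// zero-score pruning (hypothetical helper, for intuition only): the center
// alignment, every single-link move, and every swap.
long neighborhood_upper_bound(long l, long m) {
  return 1                    // the hill-climbing center itself
       + m * l                // moves: each j to one of the l other positions (incl. NULL)
       + m * (m - 1) / 2;     // swaps: unordered pairs j1 < j2
}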
-{ - LogProb best_score,score; - WordIndex i,j,l,m,old_i,j1; - Vector A(fs.size(),0); - Vector Fert(es.size(),0); - time_t it_st; - - best_score = 0 ; - l = es.size() - 1; - m = fs.size() - 1; - findBestAlignment(es, fs, A, Fert, best_score, /*tTable, aTable,*/ i_peg, j_peg); - if (best_score == 0){ - cerr << "WARNING: viterbi alignment score is zero for the following pair\n"; - printSentencePair(es, fs, cerr); - } - hillClimb(es, fs, A, Fert, best_score, tTable, i_peg, j_peg); - if (best_score <= 0){ - cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; - printSentencePair(es, fs, cerr); - if(Log){ - logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; - printSentencePair(es, fs, logmsg); - } - } - else { // best_score > 0 - // if (2 * Fert[0] < m ){ - if (2*Fert[0] <= m ){ - /* consider alignments that has Fert[0] less than - half the number of words in French sentence */ - if (neighborhood.insert(A, best_score)){ - align_total_count += best_score ; - } - } - else { // else part is added for debugging / Yaser - cerr << "WARNING:Best Alignment found violates Fertility requiremnets !!\n" ; - for (i = 0 ; i <= l ; i++) - cerr << "Fert["< 0){ - /* consider alignments that has Fert[0] less than - half the number of words in French sentence */ - old_i = A[j] ; - A[j] = A[j1] ; - A[j1] = old_i ; - if (neighborhood.insert(A, score)){ - align_total_count += score ; - } - // restore original alignment - old_i = A[j] ; - A[j] = A[j1] ; - A[j1] = old_i ; - } - } - } - for (i = 0 ; i <= l ; i++){ // all possible moves - if (i != A[j]){ // make sure not to move to same position - if ((Fert[i]+1 < MAX_FERTILITY) && - ((i == 0 && (m >= 2*(Fert[0]+1))) || (i != 0))){ - // consider legal alignments only - score = best_score * scoreOfMove(es, fs, A, Fert, tTable, j, i); - // ADD A and its score to list of alig. to collect counts over - if (score > 0){ - old_i = A[j] ; - A[j] = i ; - Fert[old_i]-- ; - Fert[i]++ ; - // add to list of alignemts here ****************** - if (neighborhood.insert(A, score)){ - align_total_count += score ; - } - // now resotre alignment and fertilities to previoud values - A[j] = old_i ; - Fert[old_i]++ ; - Fert[i]-- ; - } // end of if (score > 0) - } // end of if (i == 0 ...) - } // end of if (i != A[j]) - }// end of for(i = 0 ; ...) - }// end of for (j = 1 ; ...) 
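// [Editorial aside] What happens to this neighborhood next (plain-STL sketch,
// a hypothetical stand-in for the alignmodel class): every stored alignment
// later contributes the posterior fraction score/align_total_count, times the
// corpus count of the sentence pair, to the count tables -- see viterbi_loop
// below.
#include <map>
#include <vector>

typedef std::map<std::vector<int>, double> Neighborhood;  // alignment -> score

void collect_fractional_counts(const Neighborhood& nb, double total,
                               float sentCount) {
  for (Neighborhood::const_iterator it = nb.begin(); it != nb.end(); ++it) {
    double weight = (it->second / total) * sentCount;     // fractional count
    (void)weight;  // ...would be added to the t/a/d/n and p0/p1 count tables
  }
}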
- } // of else best_score <= 0 -} - -void model3::viterbi_loop(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, - bool dump_files, const char* alignfile, - bool collect_counts, string model ) -{ - WordIndex i, j, l, m ; - ofstream of2 ; - int pair_no; - LogProb temp; - - if (dump_files) - of2.open(alignfile); - pair_no = 0 ; // sentence pair number - // for each sentence pair in the corpus - perp.clear() ; // clears cross_entrop & perplexity - viterbiPerp.clear(); - sentPair sent ; - while(sHandler1.getNextSentence(sent)){ - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float count = sent.getCount(); - if ((sent.sentenceNo % 1000) == 0) - cerr < viterbi_alignment; - LogProb viterbi_score ; - alignmodel neighborhood; - neighborhood.clear(); - align_total_count = 0; - findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/ es, fs, align_total_count, neighborhood) ; - if (Peg){ - for (i = 0 ; i <= l ; i++) - for (j = 1 ; j <= m ; j++){ - if ( (tTable.getProb(es[i], fs[j]) > PROB_SMOOTH) && - (aTable.getValue(i, j, l, m) > PROB_SMOOTH) && - (dTable.getValue(j, i, l, m) > PROB_SMOOTH)) - findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, - p0_count, */ es, fs, align_total_count, neighborhood, i, j); - } - } - // Now Collect counts over saved neighborhoods - viterbi_score = 0 ; - if (Verbose) - cerr << "\nCollecting counts over found alignments, total prob: " - << align_total_count << "\n"; - if (Log) - logmsg << "\nCollecting counts over found alignments, total prob: " - << align_total_count << "\n"; - hash_map, LogProb, hashmyalignment, equal_to_myalignment >::iterator align ; - int acount = 0 ; - if (align_total_count == 0 ){ - cerr << " WARNINIG: For the following sentence pair : \n"; - printSentencePair(es, fs, cerr); - cerr << "The collection of alignments found have 0 probability!!\n"; - cerr << "No counts will be collected of it \n"; - if (Log){ - logmsg << "The collection of alignments found have 0 probability!!\n"; - logmsg << "No counts will be collected of it \n"; - } - } - else { - if (collect_counts) { - for(align = neighborhood.begin(); align != neighborhood.end(); align++){ - temp = (*align).second/align_total_count ; - collectCountsOverAlignement(/*tTable, aCountTable, */es, fs, /*p1_count, - p0_count ,*/ ((*align).first), temp , count); - acount++; - if (viterbi_score < temp){ - viterbi_alignment = ((*align).first); - viterbi_score = temp; - } - } - } // end of if (collect_counts) - perp.addFactor(log(double(align_total_count)), count, l, m,0); - viterbiPerp.addFactor(log(double(viterbi_score)), count, l, m,0); - - if (Verbose){ - cerr << "Collected counts over "< > viterbis; - Vectorvit; - int m=ef.get_m(); - int l=ef.get_l(); - double ret=0.0; - //#define STORE_HMM_ALIGNMENTS -#ifdef STORE_HMM_ALIGNMENTS - if( i_peg==-1 && j_peg==-1 && viterbis.size()>pair_no ) - { - output=viterbis[pair_no].first; - ret=viterbis[pair_no].second; - massert( ret==HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply ); - } - else - { - ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; - for(int j=1;j<=m;j++) - { - if( vit[j-1]+1>l ) - output.set(j,0); - else - output.set(j,vit[j-1]+1); - massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); - } - if( i_peg==-1 && j_peg==-1 ) - { - iassert(viterbis.size()==pair_no); - viterbis.push_back(make_pair(output,ret)); - } - } -#else - ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; - for(int j=1;j<=m;j++) - { - if( vit[j-1]+1>l ) - 
output.set(j,0); - else - output.set(j,vit[j-1]+1); - massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); - } -#endif - massert( j_peg==-1 || int(output(j_peg))==i_peg ); - if( j_peg!=-1 ) - massert(int(output(j_peg))==i_peg); - if( output.valid() ) - return ret; - else - { - return _viterbi_model2(ef,output,i_peg,j_peg); - } -} - -LogProb model3::_viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg, int j_peg)const -{ - WordIndex best_i=0; - LogProb ss=1; - PositionIndex l = ef.get_l(), m=ef.get_m(); - Vector Fert(l+1, (WordIndex)0); - if ((j_peg != -1) && (i_peg != -1)) - { - output.set(j_peg, i_peg); - ss *= ef.get_t(i_peg, j_peg) * ef.get_a(i_peg, j_peg); - if( ss==0 ) - cerr << "WARNING: already starting is zero: " << ef.get_t(i_peg, j_peg) << " " << ef.get_a(i_peg, j_peg) << '\n'; - } - else - ss=1; - for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) - { - LogProb score = 0 ; - for (PositionIndex i = 0 ; i <= l ; i++) - { - if( Fert[i]+1=(2 * (Fert[0] + 1)))) - { - LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); - if (temp > score ) - { - best_i = i ; - score = temp ; - } - } - } - if (score == 0){ - cerr << "WARNING: In searching for model2 best alignment\n"; - cerr << "Nothing was set for target token at position j: " << j << "\n"; - for (PositionIndex i = 0 ; i <= l ; i++){ - cerr << "i: " << i << "ttable("<= 2*(Fert[0]+1))) - || (i != 0))) - cerr <<"Passed fertility condition \n"; - else - cerr <<"Failed fertility condition \n"; - } - } - else - { - output.set(j, best_i); - Fert[best_i]++; - } - ss *= score; - } - if (ss <= 0){ - //cerr << ef; - cerr << "WARNING: Model2 viterbi alignment has zero score.\n" ; - cerr << "Here are the different elements that made this alignment probability zero \n"; - cerr << "Source length " << l << " target length " << m << '\n'; - LogProb gg=1 ; // for debugging only ..... 
- for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg){ - LogProb score = 0 ; - LogProb a = 0, t =0 ; - for (PositionIndex i = 0 ; i <= l ; i++){ - // if( Debug_Fert[i]+1=(2 * (Debug_Fert[0] + 1)))){ - LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); - if (temp > score ){ - score = temp ; - best_i = i ; - a = ef.get_a(i, j); - t = ef.get_t(i, j) ; - } - // } - } - gg *= score ; - cerr << "best: fs[" << j << "] "<< j <<" : es[" << best_i << "] " << - best_i << " , a: " << ef.get_a(best_i, j) << " t: " << t << " score " << score << " product : " << gg << " ss " << - ss << '\n'; - } - for(PositionIndex i = 0 ; i <= l ; i++) - cerr << "Fert["< -LogProb greedyClimb_WithIBM3Scoring(MoveSwapMatrix&msc2,int j_peg=-1) -{ - PositionIndex l = msc2.get_l(), m=msc2.get_m(); - int changed=0; - int iter=0; - bool hereVERB=0; - do - { - MoveSwapMatrix msc_IBM3(msc2.get_ef(),alignment(msc2)); - vector > msvec; - for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) - { - WordIndex aj=msc2(j); - for (PositionIndex j1 = j + 1 ; j1 <= m; j1++) - if((aj != msc2(j1)) && (int(j1) != j_peg)) - msvec.push_back(pair(-msc_IBM3.cswap(j,j1),OneMoveSwap(1,j,j1))); - for (PositionIndex i = 0 ; i <= l ; i++) - if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1(-msc_IBM3.cmove(i,j),OneMoveSwap(2,i,j))); - } - sort(msvec.begin(),msvec.end()); - HillClimbingSteps++; - int iused=-1; - changed=0; - for(unsigned int i=0;i1.0001 ) - { - if( hereVERB==1 ) - cerr << "SWAP: " << csts << '\n'; - msc2.doSwap(oms.a,oms.b); - changed=1; - iused=i; - break; - } - if( oms.type==2&&(csts=msc2.cmove(oms.a,oms.b))>1.0001 ) - { - if( hereVERB==1 ) - cerr << "MOVE: " << csts << '\n'; - msc2.doMove(oms.a,oms.b); - changed=1; - iused=i; - break; - } - } - if( ++iter>30 ) - { - //msc2.ef.verboseTP=1; - hereVERB=1; - cerr << "ERROR: more than 30 iterations in hill-climbing: " << iused - << " improvement: " << msvec[iused].first << " value:" << msvec[iused].second - << '\n' << msc2 << '\n'; - for(int a=0;a<20;++a) - cout << a << ' ' << msvec[a].first << ' ' << msvec[a].second << '\n'; - //cerr << msvec << '\n'; - } - if( iter>50 ) - break; - } while(changed); - return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); -} - -template -LogProb greedyClimb(MoveSwapMatrix&msc2, int j_peg = -1) -{ - if( msc2.get_ef().greedyHillClimbing()==1 ) - return greedyClimb_WithIBM3Scoring(msc2,j_peg); - PositionIndex l = msc2.get_l(), m=msc2.get_m(); - int changed=0; - do - { - HillClimbingSteps++; - changed=0; - for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) - { - WordIndex aj=msc2(j); - for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)if((aj != msc2(j1)) && (int(j1) != j_peg)&&msc2.cswap(j, j1) > 1.0) - msc2.doSwap(j, j1), changed=1; - for (PositionIndex i = 0 ; i <= l ; i++)if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+11.0) - msc2.doMove(i, j), changed=1; - } - } while (changed); - return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); -} - -template -LogProb hillClimb_std(MoveSwapMatrix&msc2, int= -1,int j_peg = -1) -{ - if( msc2.isLazy() ) - return greedyClimb_WithIBM3Scoring(msc2,j_peg); - if( LogHillClimb>1 ) - cout << msc2 << '\n'; - PositionIndex l = msc2.get_l(), m=msc2.get_m(); - int changes=0; - int best_change_type=-1, best_change_v1=-1, best_change_v2=-1; - do - { - HillClimbingSteps++; - LogProb best_change_so_far = 1.00001 ; - best_change_type=0; - for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) - { - WordIndex aj=msc2(j); - for 
(PositionIndex j1 = j + 1 ; j1 <= m; j1++)if((aj != msc2(j1)) && (int(j1) != j_peg)) - { - LogProb change = msc2.cswap(j, j1); - if (change > best_change_so_far) - { - best_change_so_far = change ; - best_change_type=1; - best_change_v1=j; - best_change_v2=j1; - if( LogHillClimb ) - cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; - massert(msc2.get_ef().isSubOptimal()==1); - } - } - for (PositionIndex i = 0 ; i <= l ; i++)if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1 best_change_so_far) - { - best_change_so_far = change ; - best_change_type=2; - best_change_v1=j; - best_change_v2=i; - if( LogHillClimb ) - cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; - massert(msc2.get_ef().isSubOptimal()==1); - } - } - } - if (best_change_type==1) - { - msc2.doSwap(best_change_v1, best_change_v2); - if( LogHillClimb ) - cerr << "SW-CLIMB-DONE: " << j_peg << msc2 << '\n'; - } - if (best_change_type==2) - { - msc2.doMove(best_change_v2, best_change_v1); - if( LogHillClimb ) - cerr << "MO-CLIMB-DONE: " << j_peg << msc2 << '\n'; - } - changes++; - if( changes>40 ) - { - if( PrintHillClimbWarning++<1000 ) - cerr << "WARNING: already " << changes << " iterations in hillclimb: " << best_change_so_far << " " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << '\n'; - else if (PrintHillClimbWarning==1000) - cerr << "ERROR: too many hill climbing warnings => I do not print more.\n"; - } - if(changes>60 ) - { - cerr << msc2 << '\n'; - break; - } - } while (best_change_type); - return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); -} - -template -bool extendCenterList(Vector*,LogProb> >&setOfGoodCenters,MoveSwapMatrix *msc,double peggedAlignmentScore) -{ - unsigned int l=msc->get_ef().get_l(); - set alreadyCovered; - for(unsigned int nr=0;nr::const_iterator i=alreadyCovered.begin();i!=alreadyCovered.end();++i) - { - if( i->type==1||i->type==4) - msc->delCenter(); - if( i->type==1 ) - { - for(unsigned int ii=0;ii<=l;++ii) - if( (*msc)(i->a)!=ii ) - msc->delMove(ii,i->a); - } - else if( i->type==2||i->type==4 ) - msc->delSwap(i->a,i->b); - else if( i->type==3 ) - msc->delMove(i->b,i->a); - else abort(); - } - setOfGoodCenters.push_back(make_pair(msc,peggedAlignmentScore)); - return 1; -} - -bool OldLog=0; -short OldLogPeg=0,OldLogHillClimb=0; -class Als -{ -public: - int s,a,b; - double v; - Als(int _s,int _a,int _b,double _v) - : s(_s),a(_a),b(_b),v(_v) {} -}; - -inline bool operator<(const Als&x,const Als&y) -{return x.v>y.v;} - -template -void model3::viterbi_loop_with_tricks(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, - bool dump_files, const char* alignfile, - bool collect_counts, string model, bool final, - ADDITIONAL_MODEL_DATA_IN*dm_in, - ADDITIONAL_MODEL_DATA_OUT*dm_out) -{ - ofstream *writeNBestErrorsFile=0; - if( (dump_files||FEWDUMPS)&&PrintN&&ReferenceAlignment.size()>0 ) - { - string x=alignfile+string("NBEST"); - writeNBestErrorsFile= new ofstream(x.c_str()); - } - ofstream *of3=0; - PositionIndex i, j, l, m ; - ofstream of2; - int pair_no; - HillClimbingSteps=0; - NumberOfAlignmentsInSophisticatedCountCollection=0; - if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ) - of2.open(alignfile); - if( dump_files&&PrintN&&final ) - { - string x=alignfile+string("NBEST"); - of3= new ofstream(x.c_str()); - } - pair_no = 0 ; // sentence pair number - // 
for each sentence pair in the corpus - perp.clear() ; // clears cross_entrop & perplexity - viterbiPerp.clear() ; // clears cross_entrop & perplexity - sentPair sent ; - int NCenter=0,NHillClimbed=0,NAlignment=0,NTotal=0,NBetterByPegging=0; - while(sHandler1.getNextSentence(sent)){ - if( sent.eSent.size()==1||sent.fSent.size()==1 ) - continue; - SentNr=sent.sentenceNo; - Vector& es = sent.eSent; - Vector& fs = sent.fSent; - const float count = sent.getCount(); - if ((sent.sentenceNo % 10000) == 0) - cerr <*,LogProb> >setOfGoodCenters(1); - set alignments; - MoveSwapMatrix *best = (setOfGoodCenters[0].first = new MoveSwapMatrix(ef, viterbi2alignment)); - MoveSwapMatrix _viterbi(*best), *viterbi=&_viterbi; // please, don't delete this line (FJO) - if (Log) - logmsg << "VITERBI: " << alignment(_viterbi); - if( ef.isSubOptimal() ) - setOfGoodCenters[0].second = hillClimb_std(*best); - else - { - setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source(*best); - if( setOfGoodCenters[0].second==0 ) - { - cerr << "PROBLEM: alignment is 0.\n"; - best->get_ef().prob_of_target_and_alignment_given_source(*best,1); - } - } - int bestAlignment=0; - - - for(unsigned int i=0;icheck(); - alignments.insert(*best); - if (setOfGoodCenters[bestAlignment].second <= 0){ - if( PrintZeroScoreWarning++<100 ) - { - cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; - cerr << alignment(*setOfGoodCenters[bestAlignment].first) ; - printSentencePair(es, fs, cerr); - if(Log){ - logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; - printSentencePair(es, fs, logmsg); - } - } - else if(PrintZeroScoreWarning==100) - { - cerr << "ERROR: too many zero score warnings => no additional one will be printed\n"; - } - setOfGoodCenters[bestAlignment].second=1e-300; - continue; - } - int nHillClimbed=1,nAlignment=1; - bool flagBetterByPegging=0; - if ( Peg ) - { - const MoveSwapMatrix *useMatrix=viterbi; // it is faster using 'best', ... 
(FJO) - Array2 > linkCache(l+1, m+1, false); - if(UseLinkCache)for(unsigned int j=1;j<=m;j++)linkCache((*useMatrix)(j), j)=1; - for(PositionIndex j=1;j<=m;j++)for(PositionIndex i=0;i<=l;i++) - { - nAlignment++; - if( i!=(*useMatrix)(j) && (UseLinkCache==0||linkCache(i,j)==0) && - ef.get_t(i,j)>ef.get_t((*useMatrix)(j),j)*PEGGED_CUTOFF && - (i != 0 || (m >= 2 * (useMatrix->fert(0)+1)))) - { - MoveSwapMatrix *BESTPEGGED=0; - LogProb peggedAlignmentScore; - nHillClimbed++; - if( ef.isSubOptimal() ) - { - BESTPEGGED = new MoveSwapMatrix(*useMatrix); - BESTPEGGED->doMove(i, j); - peggedAlignmentScore= hillClimb_std(*BESTPEGGED, i,j); - } - else - { - alignment pegAlignment(l,m); - peggedAlignmentScore=viterbi_model2(ef,pegAlignment,pair_no-1,i,j); - BESTPEGGED = new MoveSwapMatrix(ef,pegAlignment); - massert( pegAlignment(j)==i ); - } - if(UseLinkCache) - for(unsigned int j=1;j<=m;j++) - linkCache((*BESTPEGGED)(j), j)=1; - if( peggedAlignmentScore>setOfGoodCenters[bestAlignment].second*(LogProb)PEGGED_CUTOFF && alignments.count(*BESTPEGGED)==0 ) - { - if(extendCenterList(setOfGoodCenters,BESTPEGGED,peggedAlignmentScore)) - { - alignments.insert(*BESTPEGGED); - if( peggedAlignmentScore>1.00001*setOfGoodCenters[bestAlignment].second ) - { - if( LogPeg ) - { - cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore/setOfGoodCenters[bestAlignment].second << '\n'; - cerr << "NEW BEST: " << alignment(*BESTPEGGED); - cerr << "OLD : " << alignment(*setOfGoodCenters[bestAlignment].first); - } - flagBetterByPegging=1; - bestAlignment=alignments.size()-1; - } - } - assert( differences(*BESTPEGGED, *best)!=0 ); - BESTPEGGED=0; } - else - delete BESTPEGGED; - } - } - } // end of if(Peg) - NBetterByPegging+=flagBetterByPegging; - for(unsigned int i=0;icheck(); - if( LogPeg>1 ) - cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." 
<< '\n'; - int alTotal=collectCountsOverNeighborhood(setOfGoodCenters,es, fs, tTable, aCountTable, - dCountTable, nCountTable, p1_count, p0_count, - align_total_count, count, collect_counts, dm_out); - if( LogPeg>1 ) - { - cout << "ALL: " << alTotal << " from " << pow(float(l+1),float(m)) << '\n'; - massert(alTotal<=pow(double(l+1),double(m))); - } - NCenter+=setOfGoodCenters.size();NHillClimbed+=nHillClimbed;NAlignment+=nAlignment;NTotal+=alTotal; - perp.addFactor(log(double(align_total_count)), count, l, m,0); - viterbiPerp.addFactor(log(double(setOfGoodCenters[bestAlignment].second)), count, l, m,0); - massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count))); - if (dump_files||(FEWDUMPS&&sent.sentenceNo<1000)||(final&&(ONLYALDUMPS)) ) - printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), pair_no, - setOfGoodCenters[bestAlignment].second); - for(unsigned int i=0;icheck(); - if( of3||(writeNBestErrorsFile&&pair_no als; - for(unsigned int s=0;s&msc= *setOfGoodCenters[s].first; - msc.check(); - double normalized_ascore=setOfGoodCenters[s].second; - if( !msc.isCenterDeleted() ) - als.push_back( Als(s,0,0,normalized_ascore) ); - - for(WordIndex j=1;j<=m;j++) - for(WordIndex i=0;i<=l;i++) - if( i!=msc(j)&& !msc.isDelMove(i,j) ) - als.push_back( Als(s,i,j,msc.cmove(i,j)*normalized_ascore)); - for(PositionIndex j1=1;j1<=m;j1++) - for(PositionIndex j2=j1+1;j2<=m;j2++) - if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) - als.push_back( Als(s,-j1,-j2,msc.cswap(j1,j2)*normalized_ascore)); - } - sort(als.begin(),als.end()); - double sum=0,sum2=0; - for(unsigned int i=0;i scores; - ErrorsInAlignment(ReferenceAlignment[pair_no-1],x.getAlignment(),l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); - ef.computeScores(x,scores); - *writeNBestErrorsFile << ALmissing+ALtoomuch << ' '; - for(unsigned int i=0;i0.0)?(-log(scores[i])):1.0e6) << ' '; - *writeNBestErrorsFile << '\n'; - } - } - } - if( writeNBestErrorsFile ) - *writeNBestErrorsFile << '\n'; - } - addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(),sent.sentenceNo,l); - if (Log) - logmsg << "processing this sentence pair ("<(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, \ - bool dump_files, const char* alignfile,bool collect_counts, string, bool final,\ - B*d4m,C*d5m); - -INSTANTIATE(transpair_model3, void, void); -INSTANTIATE(transpair_modelhmm, const hmm, void); -INSTANTIATE(transpair_modelhmm, const hmm, d4model); -INSTANTIATE(transpair_modelhmm, const hmm, d5model); -INSTANTIATE(transpair_model3, void,d4model); -INSTANTIATE(transpair_model3, void,d5model); -INSTANTIATE(transpair_model4, d4model,d4model); -INSTANTIATE(transpair_model4, d4model,d5model); -INSTANTIATE(transpair_model5, d5model,d5model); diff --git a/ext/giza-pp/GIZA++-v2/myassert.cpp b/ext/giza-pp/GIZA++-v2/myassert.cpp deleted file mode 100644 index 2d49be82..00000000 --- a/ext/giza-pp/GIZA++-v2/myassert.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "mystl.h" -#include -#include "myassert.h" - -#ifndef STANDARD_ASSERT -void myerror(int line,const char *file,const char *expression) -{ - cerr << "(general.h):Assertion failed: '" << expression << "' ::: b " - << file << ":" << line << endl; - cout << "(general.h):Assertion failed: '" << expression << "' ::: b " - << file << ":" << line << endl; -} -void imyerror(int line,const char *file,const char *expression) -{ - cerr << "Error: '" << expression << "' ::: in Source " << 
file - << ":" << line << endl; -} - -#endif - diff --git a/ext/giza-pp/GIZA++-v2/myassert.h b/ext/giza-pp/GIZA++-v2/myassert.h deleted file mode 100644 index b648fddd..00000000 --- a/ext/giza-pp/GIZA++-v2/myassert.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef MY_ASSERT_DEFINED -#define MY_ASSERT_DEFINED -void myerror(int line,const char *file,const char *expression); -void imyerror(int line,const char *file,const char *expression); - -#define iassert(expression) do {if (!(expression)) {imyerror(__LINE__,__FILE__,#expression);}} while (0) - -# -#define massert(expr) do {} while(0) - -#define vassert(expr) do {} while(0) - -#include - -#endif - - - - - diff --git a/ext/giza-pp/GIZA++-v2/mymath.h b/ext/giza-pp/GIZA++-v2/mymath.h deleted file mode 100644 index f8ad926c..00000000 --- a/ext/giza-pp/GIZA++-v2/mymath.h +++ /dev/null @@ -1,9 +0,0 @@ -/* ---------------------------------------------------------------- */ -/* Copyright 1998 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */ -/* Franz Josef Och */ -/* ---------------------------------------------------------------- */ -#ifndef HEADER_MYMATH_DEFINED -#define HEADER_MYMATH_DEFINED -inline double mfabs(double x){return (x<0)?(-x):x;} -#include -#endif diff --git a/ext/giza-pp/GIZA++-v2/mystl.h b/ext/giza-pp/GIZA++-v2/mystl.h deleted file mode 100644 index 2046e11b..00000000 --- a/ext/giza-pp/GIZA++-v2/mystl.h +++ /dev/null @@ -1,321 +0,0 @@ -/* ---------------------------------------------------------------- */ -/* Copyright 1998 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */ -/* Franz Josef Och */ -/* ---------------------------------------------------------------- */ -#ifndef MY_STL_H_DEFINED -#define MY_STL_H_DEFINED - -#include -using namespace std; -#ifdef USE_STLPORT -#ifdef __STL_DEBUG -using namespace _STLD; -#else -using namespace _STL; -#endif -#endif - -#include "myassert.h" -#include -#include - -#include -#define hash_map unordered_map - -#include -#include -#include "mymath.h" -#include "Array2.h" - -#define over_string(a,i) for(unsigned int i=0;i=(a).low();i--) -#define over_arr(a,i) for(int i=(a).low();i<=(a).high();i++) -#define over_arrMAX(a,i,max) for(int i=(a).low();i<=min((a).high(),max-1);i++) -#define backwards_arr(a,i) for(int i=(a).high();i>=(a).low();i--) - -extern double n1mult,n2mult,n3mult; - -inline double realProb(int n1,int n2) -{ - massert(n1<=n2); - iassert(n1>=0&&n2>0); - if(n2==0)n2=1; - return ((double)n1)/(double)n2; -} - -inline double verfProb(int n1,int n2) -{ - double prob = realProb(n1,n2); - if( n1==1 )return prob*n1mult; - else if( n1==2 )return prob*n2mult; - else if( n1==3 )return prob*n3mult; - else - return prob; -} - -inline bool prefix(const string&x,const string&y) -{ - if(y.size()>x.size() ) - return 0; - for(unsigned int i=0;i -int lev(const T&s1,const T&s2) -{ - Array2 > a(s1.size()+1,s2.size()+1,1000); - Array2,vector > > back(s1.size()+1,s2.size()+1,pair(0,0)); - for(unsigned int i=0;i<=s1.size();i++) - for(unsigned int j=0;j<=s2.size();j++) - { - if( i==0&&j==0 ) - a(i,j)=0; - else - { - int aDEL=100,aINS=100,aSUB=100; - if(i>0) - aDEL=a(i-1,j)+1; - if(j>0) - aINS=a(i,j-1)+1; - if(i>0&&j>0) - aSUB=a(i-1,j-1)+ !(s1[i-1]==s2[j-1]); - if( aSUB<=aDEL && aSUB<=aINS ) - { - a(i,j)=aSUB; - back(i,j)=pair(i-1,j-1); - } - else if( aDEL<=aSUB && aDEL<=aINS ) - { - a(i,j)=aDEL; - back(i,j)=pair(i-1,j); - } - else - { - a(i,j)=aINS; - back(i,j)=pair(i,j-1); - } - } - } - return a(s1.size(),s2.size()); -} - -template -float rel_lev(const T&s1,const T&s2) -{ - if( s1.size()==0 ) - return 
s2.size()==0; - else - return min(1.0,lev(s1,s2)/(double)s1.size()); -}*/ - -template int Hash(const pair&a) -{ return Hash(a.first)+13001*Hash(a.second); } - -template -ostream& operator<<(ostream &out,const pair &ir) -{ - out << "(" << ir.first << "," << ir.second << ")"; - return out; -} - -inline int Hash(const string& s) -{ - int sum=0; - string::const_iterator i=s.begin(),end=s.end(); - for(;i!=end;i++)sum=5*sum+(*i); - return sum; -} -template -class tri -{ -public: - A a; - B b; - C c; - tri(){}; - tri(const A&_a,const B&_b,const C&_c) - : a(_a),b(_b),c(_c) {} -}; -template -bool operator==(const tri&x,const tri&y) -{ return x.a==y.a&&x.b==y.b&&x.c==y.c;} - -template -bool operator<(const tri&x,const tri&y) -{ - if(x.a -class my_hash -{ -public: - int operator()(const T&t)const {return Hash(t);} -}; - -inline int Hash(int value) { return value; } -#define MY_HASH_BASE hash_map > - -template -class leda_h_array : public MY_HASH_BASE -{ -private: - B init; -public: - leda_h_array() : MY_HASH_BASE() {} - leda_h_array(const B&_init) - : MY_HASH_BASE(),init(_init) {} - bool defined(const A&a) const - { return find(a)!=this->end(); } - const B&operator[](const A&a)const - { - typename MY_HASH_BASE::const_iterator pos=find(a); - if( pos==this->end() ) - return init; - else - return pos->second; - } - B&operator[](const A&a) - { - typename MY_HASH_BASE::iterator pos=find(a); - if( pos==this->end() ) - { - insert(MY_HASH_BASE::value_type(a,init)); - pos=find(a); - iassert(pos!=this->end()); - } - return pos->second; - } - const B&initValue()const - {return init;} -}; - -#define forall_defined_h(a,b,c,d) for(typename leda_h_array::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__) -template -ostream & operator<<(ostream&out,const leda_h_array&w) -{ - T t; - bool makeNl=0; - out << "h_array{"; - forall_defined_h(T,U,t,w) - { - if( makeNl ) - out << "\n "; - out << "EL:" << t << " INH:" << w[t] << "."; - makeNl=1; - } - return out << "}\n"; -} - -template -istream & operator>>(istream&in,leda_h_array&) -{ - return in; -} - -template -bool operator==(const leda_h_array&p1,const leda_h_array&p2) -{ - A v; - forall_defined_h(A,B,v,p1) - if( !( p1[v]==p2[v]) ) return 0; - forall_defined_h(A,B,v,p2) - if( !( p1[v]==p2[v]) ) return 0; - return 1; -} - -template -int count_elements(T a,T b) -{ - int c=0; - while(a!=b) - { - a++; - c++; - } - return c; -} - -template -T normalize_if_possible_with_increment(T*a,T*b,int increment) -{ - T sum=0; - for(T*i=a;i!=b;i+=increment) - sum+=*i; - if( sum ) - for(T*i=a;i!=b;i+=increment) - *i/=sum; - else - { - T factor=increment/(b-a); - for(T*i=a;i!=b;i+=increment) - *i=factor; - } - return sum; -} - -template -inline int m_comp_3way(T a,T b,int n) -{ - int _n=0; - while((_n++ -void smooth_standard(T*a,T*b,double p) -{ - int n=b-a; - if( n==0 ) - return; - double pp=p/n; - for(T*i=a;i!=b;++i) - *i = (1.0-p)*(*i)+pp; -} - -template -const T *conv(typename vector::const_iterator i) -{ - return &(*i); -} -#if __GNUC__>2 -template -T *conv(typename vector::iterator i) -{ - return &(*i); -} -#endif - -/*template -const T *conv(const T*x) -{ - return x; -}*/ -template -T *conv(T*x) -{ - return x; -} - -#endif diff --git a/ext/giza-pp/GIZA++-v2/parse.cpp b/ext/giza-pp/GIZA++-v2/parse.cpp deleted file mode 100644 index ebb136e0..00000000 --- a/ext/giza-pp/GIZA++-v2/parse.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John 
-Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-
-/* FJO 01/2001: completely reorganized parameter processing */
-
-/* [Editor's note] The three angle-bracket header names below were lost in
-   extraction; <sstream>, <cstring> and <fstream> are an assumption based on
-   what this file visibly uses (istringstream, strcmp, ifstream). */
-#include <sstream>
-#include <cstring>
-#include <fstream>
-#include "defs.h"
-#include "utility.h"
-#include "Globals.h"
-#include "D4Tables.h"
-#include "D5Tables.h"
-#include "ATables.h"
-#include "Parameter.h"
-
-extern bool ONLYALDUMPS;
-
-void parseConfigFile (char * fname )
-  // This function reads in the configuration file to set up some run-time
-  // parameters. The parameters are global variables that are defined in
-  // main.cc and used all over the place in the program.
-  // The format of the configuration file is as follows:
-  // The character '\n' separates lines.
-  // Lines that start with "//" (after skipping over white space) are
-  // considered comments and will be ignored.
-  // Any other line is considered an attribute-setting instruction and is
-  // divided into halves (separated by a colon ":"). The first half is the
-  // attribute name, which consists of the concatenation of all non-white-space
-  // tokens before the colon, with single spaces separating them.
-  // The attribute value is the first token after the colon (anything after
-  // it will be ignored).
-  // For example, if the configuration file has the following entry:
-  //
-  // NO. ITERATIONS MODEL 2 : 10
-  //
-  // then the attribute is "NO. ITERATIONS MODEL 2", and the attribute value
-  // is "10" (these do not include the quotation marks).
-
-{
-
-  string line, word, attrib, attribval ;
-  ifstream Config_File(fname);
-  if(!Config_File){
-    cerr << "ERROR: Cannot open configuration file " << fname << "!\n" ;
-    exit(1);
-  }
-
-  cout << "The following options are from the config file and will be overwritten by any command line options.\n";
-
-  while(getline(Config_File, line)){
-
-    istringstream buffer(line);
-    word = attrib = attribval = "" ;
-    buffer >> word ;
-    if (word != "//"){ // if line does not start with "//" (i.e. not a comment)
-      attrib = word ;
-      while((buffer >> word) && (word != ":")){
-        attrib += " " + word ;
-      }
-      if(!(buffer >> attribval))
-      {
-        istringstream buffer2(line);
-        buffer2>>attrib;
-        buffer2>>attribval;
-      }
-
-      // This is where (1) the configuration file is defined and
-      // (2) parsing of its attributes occurs.
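      // [Editor's note] A minimal, hypothetical example of the format the
      // branches below recognize (file contents are illustrative only):
      //
      //   // train on the small corpus first
      //   NO. ITERATIONS MODEL 2 : 10
      //   t FILE : prev.t
      //
      // "t FILE" is matched explicitly below, while an attribute like
      // "NO. ITERATIONS MODEL 2" falls through to makeSetCommand.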
- - if(attrib == "t FILE"){ - t_Filename = attribval; - cout << "\tt file: " << t_Filename << '\n'; - } - else if(attrib == "a FILE"){ - a_Filename = attribval; - cout << "\ta file: " << a_Filename << '\n'; - } - else if(attrib == "d FILE"){ - d_Filename = attribval; - cout << "\td file: " << d_Filename << '\n'; - } - else if(attrib == "n FILE"){ - n_Filename = attribval; - cout << "\tn file: " << n_Filename << '\n'; - } - else if(attrib == "p0 FILE"){ - p0_Filename = attribval; - cout << "\tp0 file: " << p0_Filename << '\n'; - } - else if ( line == ""){} - else if( !makeSetCommand(attrib,attribval,getGlobalParSet(),2) ) - cerr << "ERROR: Unrecognized attribute :" << attrib << '\n'; - } - } -} - - -void parseArguments(int argc, char *argv[]) -{ - int arg = 1; - - if(!strcmp(argv[1], "--h") || !strcmp(argv[1], "--help")){ - printHelp(); - exit(0); - } - if( argv[1][0]=='-' ) - arg=0; - else - parseConfigFile(argv[1]); - while(++arg2 && argv[arg][0]=='-' && argv[arg][1]=='-' ) - { - if( !makeSetCommand(argv[arg]+1,"1",getGlobalParSet(),2)) - cerr << "WARNING: ignoring unrecognized option: "<< argv[arg] << '\n' ; - } - else if( arg+1 -#include -#include -#include -#include -#include -#include - -using namespace std; - -int main(int argc,char**argv) -{ - vectorweights; - vectorfilenames; - for(int i=1;i v1,v2; - map id1,id2; - vector iid1(2),iid2(2); - - string w1(filenames[0]); - string w2(filenames[1]); - - if( w1.length()>4&&w2.length()>4&&((w1.substr(w1.length()-4,w1.length())==".tok" && w2.substr(w2.length()-4,w2.length())==".tok" )|| - (w1.substr(w1.length()-4,w1.length())==".txt" && w2.substr(w2.length()-4,w2.length())==".txt" ) )) - { - w1=w1.substr(0,w1.length()-4); - w2=w2.substr(0,w2.length()-4); - cerr << "w1:"<< w1 << " w2:" << w2 << endl; - } - - - string vocab1(w1),vocab2(w2),snt1,snt2; - unsigned int slashpos=vocab1.rfind('/')+1; - if( slashpos>=vocab1.length() ) slashpos=0; - string vocab1x(vocab1.substr(slashpos,vocab1.length())); - cout << vocab1 << " -> " << vocab1x << endl; - slashpos=vocab2.rfind('/')+1; - if( slashpos>=vocab2.length() ) slashpos=0; - string vocab2x(vocab2.substr(slashpos,vocab2.length())); - cout << vocab2 << " -> " << vocab2x << endl; - snt1=vocab1+"_"+vocab2x+string(".snt"); - snt2=vocab2+"_"+vocab1x+string(".snt"); - vocab1+=string(".vcb"); - vocab2+=string(".vcb"); - - ofstream ovocab1(vocab1.c_str()),ovocab2(vocab2.c_str()),osnt1(snt1.c_str()),osnt2(snt2.c_str()); - for(unsigned int i=0;i t1,t2; - istringstream ii1(line1); - while(ii1>>word) - { - t1.push_back(word); - v1[word]++; - if( id1.find(word)==id1.end() ) - { - iid1.push_back(word); - id1[word]=iid1.size()-1; - } - } - istringstream ii2(line2); - while(ii2>>word) - { - t2.push_back(word); - v2[word]++; - if( id2.find(word)==id2.end() ) - { - iid2.push_back(word); - id2[word]=iid2.size()-1; - } - } - double w=1.0; - if( i/2 -#include -#include -#include "defs.h" -#include "vocab.h" -#include "Perplexity.h" -#include "getSentence.h" -#include "TTables.h" -#include "Globals.h" -#include "Parameter.h" - -void printHelp(void) -{ - cerr << "Usage:\n\n" << Usage << '\n'; - cerr << "Options (these override parameters set in the config file):\n\n"; - cerr << "\t--v \t\t print verbose message, Warning this is not very descriptive and not systematic.\n"; - cerr << "\t--NODUMPS \t Do not write any files to disk (This will over write dump frequency options).\n"; - cerr << "\t--h[elp]\t\tprint this help\n"; - cerr << "\t--p\t\tUse pegging when generating alignments for Model3 training. 
(Default NO PEGGING)\n"; - cerr << "\t--st\t\tto use a fixed ditribution for the fertility parameters when tranfering from model 2 to model 3 (Default complicated estimation)\n"; - printGIZAPars(cout); -} - - -void generatePerplexityReport(const Perplexity& trainperp, - const Perplexity& testperp, - const Perplexity& trainVperp, - const Perplexity& testVperp, - ostream& of, int trainsize, int testsize, - bool) -{ - unsigned int i, m; - unsigned int m1 = max(trainperp.size(), testperp.size()); - unsigned int m2 = max(trainVperp.size(), testVperp.size()); - m = max(m1,m2); - of << "#trnsz\ttstsz\titer\tmodel\ttrn-pp\t\ttest-pp\t\ttrn-vit-pp\t\ttst-vit-pp\n"; - for (i = 0 ; i & es, - Vector& fs, - ostream& of) - - // just writes a sentece pair to the give output stream, one sentence pair line - // it writes token ids not actual tokens. -{ - WordIndex i, j, l, m; - l = es.size() - 1; - m = fs.size() - 1; - of << "Source sentence length : " << l << " , target : " << m << "\n"; - for (i = 1 ; i <= l ; i++) - of << es[i] << ' '; - of << "\n"; - for (j = 1 ; j <= m ; j++) - of << fs[j] << ' '; - of << "\n"; - -} - -extern short CompactAlignmentFormat; -void printAlignToFile(const Vector& es, - const Vector& fs, - const Vector& evlist, - const Vector& fvlist, - ostream& of2, - const Vector& viterbi_alignment, - int pair_no, double alignment_score) - - // prints the given alignment to alignments file (given it stream pointer) - // in a format recognizable by the draw-alignment tool ... which is of the - // example (each line triple is one sentence pair): - // # sentence caption - // target_word_1 target_word_2 ..... target_word_m - // source_word_1 ({ x y z }) source_word_2 ({ }) .. source_word_n ({w}) - // where x, y, z, and w are positions of target words that each source word - // is connected to. - -{ - WordIndex l, m; - Vector > translations(es.size()); // each english words has a vector - // of zero or more translations . 
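  // [Editor's note] A hypothetical example of one sentence-pair triple in
  // the (non-compact) format produced below, for source "blue house" and
  // target "maison bleue":
  //
  //   # Sentence pair (1) source length 2 target length 2 alignment score : 0.01
  //   maison bleue
  //   NULL ({ }) blue ({ 2 }) house ({ 1 })
  //
  // Each "({ ... })" lists the 1-based target positions aligned to that
  // source word; the NULL word at position 0 collects unaligned targets.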
- l = es.size() - 1; - m = fs.size() - 1; - if( CompactAlignmentFormat ) - { - for (WordIndex j = 1 ; j <= m ; j++) - if( viterbi_alignment[j] ) - of2 << viterbi_alignment[j]-1 << ' ' << j-1 << ' '; - of2 << '\n'; - } - else - { - of2 << "# Sentence pair (" << pair_no <<") source length " << l << " target length "<< m << - " alignment score : "<< alignment_score << '\n'; - for (WordIndex j = 1 ; j <= m ; j++){ - of2 << fvlist[fs[j]].word << " " ; - translations[viterbi_alignment[j]].push_back(j); - } - of2 << '\n'; - - for (WordIndex i = 0 ; i <= l ; i++){ - of2 << evlist[es[i]].word << " ({ " ; - for (WordIndex j = 0 ; j < translations[i].size() ; j++) - of2 << translations[i][j] << " " ; - of2 << "}) "; - } - of2 << '\n'; - } -} - - -void printOverlapReport(const tmodel& tTable, - sentenceHandler& testHandler, vcbList& trainEList, - vcbList& trainFList, vcbList& testEList, vcbList& testFList) -{ - set > testCoocur ; - sentPair s ; - /* string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ; - ofstream of_unseenCoocur(unseenCoocurFile.c_str()); - - string seenCoocurFile = Prefix + ".tst.seen.cooc" ; - ofstream of_seenCoocur(seenCoocurFile.c_str()); - */ - testHandler.rewind(); - int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ; - while(testHandler.getNextSentence(s)){ - for (WordIndex i = 1 ; i < s.eSent.size() ; i++) - for (WordIndex j = 1 ; j < s.fSent.size() ; j++) - testCoocur.insert(pair (s.eSent[i], s.fSent[j])) ; - } - set >::const_iterator i ; - for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){ - if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){ - seen_coocur ++ ; - // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n'; - } - else { - unseen_coocur++; - // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n'; - } - } - - string trgUnkFile = Prefix + ".tst.trg.unk" ; - ofstream of_trgUnk(trgUnkFile.c_str()); - - for (WordIndex i = 0 ; i < testFList.getVocabList().size() && i < testFList.uniqTokens();i++) - if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0){ - of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq - << '\n'; - trgUnk++ ; - } - string srcUnkFile = Prefix + ".tst.src.unk" ; - ofstream of_srcUnk(srcUnkFile.c_str()); - - for (WordIndex j = 0 ; j < testEList.getVocabList().size() && j < testEList.uniqTokens();j++) - if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0){ - srcUnk++ ; - of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq - << '\n'; - } - string summaryFile = Prefix + ".tst.stats" ; - ofstream of_summary(summaryFile.c_str()); - of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n"; - of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n'; - of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n'; - of_summary << "unique unseen source tokens: " << srcUnk << '\n'; - of_summary << "unique unseen target tokens: " << trgUnk << '\n'; - of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n'; - of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n'; - -} - diff --git a/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp b/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp deleted file mode 100644 index 1ce76481..00000000 --- a/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -using namespace std; - 
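// [Editor's note] The myset class below implements a set of ints as a sorted
// vector: insert() locates the position with lower_bound, and check_cap()
// grows capacity in small steps, trading O(n) insertion for much lower
// per-element memory overhead than a node-based std::set -- relevant here
// because one such set is kept for every source word id. A sketch of the
// intended usage (kept as a comment, since myset is only declared below):
//
//   myset s;
//   s.insert(42); s.insert(7); s.insert(42);   // duplicate 42 is ignored
//   for (myset::iterator it = s.begin(); it != s.end(); ++it)
//     cout << *it << ' ';                      // prints "7 42" in order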
-class myset { - private: - vector data; - void check_cap() { - size_t dc = data.capacity(); - if (dc - data.size() < 3) { - if (dc < 4) { dc = 2; } - if (dc < 18) { dc*=2; } else { dc+=15; } - data.reserve(dc); - } - } - public: - typedef vector::iterator iterator; - void insert(int x) { - if (data.size() == 0) { data.push_back(x); return; } - vector::iterator p = lower_bound(data.begin(), data.end(), x); - int i = p - data.begin(); - if (i >= data.size()) { check_cap(); data.push_back(x); return; } - if (*p == x) return; - check_cap(); - data.insert(data.begin() + i,x); - } - iterator begin() { return data.begin(); } - iterator end() { return data.end(); } -}; - -//#include -// typedef std::set intset; -//#include -// typedef __gnu_cxx::hash_set intset; -typedef myset intset; - - -int main(int argc,char **argv) -{ - if( argc!=2 ) - { - cerr << "Usage: " << argv[0] << " snt12 \n"; - cerr << "Converts GIZA++ snt-format into plain text.\n"; - exit(1); - } - ifstream t(argv[1]); - string line1,line2,line3; - vector vsi(400000); - int nLine=0; - int totalElems=0; - while(getline(t,line1)&&getline(t,line2)&&getline(t,line3)) - { - istringstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str()); - double count; - string word; - eingabe1>>count; - vectorl1,l2; - while(eingabe2>>word) - l1.push_back(atoi(word.c_str())); - while(eingabe3>>word) - l2.push_back(atoi(word.c_str())); - if( ((++nLine)%1000)==0 ) - cerr << "line " << nLine << '\n'; - for(unsigned int j=0;j=int(vsi.size()) ) - { - cerr << "I have to resize: " << l1[i] << endl; - vsi.resize(l1[i]+1000); - } - intset&theset=vsi[l1[i]]; - for(unsigned int j=0;j::iterator i=vsi.begin();i != vsi.end(); ++i) { - for(intset::iterator j=i->begin();j!=i->end();++j) - cout << vi << " " << *j << endl; - ++vi; - } -} - diff --git a/ext/giza-pp/GIZA++-v2/snt2cooc.cpp b/ext/giza-pp/GIZA++-v2/snt2cooc.cpp deleted file mode 100644 index c6af6d49..00000000 --- a/ext/giza-pp/GIZA++-v2/snt2cooc.cpp +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -void readVoc(istream&in,map&voc) -{ - string line,s1,s2; - voc["1"]="UNK"; - if( !in )cerr <<"Vocabulary does not exist.\n"; - while(getline(in,line)) - { - istringstream eingabe(line); - if( !(eingabe>>s1>>s2)) - cerr << "ERROR in vocabulary '" << line << "'\n"; - voc[s1]=s2; - } -} - -int maxElems=0; -int main(int argc,char **argv) -{ - if( argc!=4&&argc!=5 ) - { - cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 \n"; - cerr << "Converts GIZA++ snt-format into plain text.\n"; - exit(1); - } - bool counts=0; - if( argc==5 ) - { - if(string(argv[4])!="-counts") - cerr << "ERROR: wrong option " << argv[5] << endl; - counts=1; - maxElems=10000000; - } - ifstream v1(argv[1]),v2(argv[2]),t(argv[3]); - mapvoc1,voc2; - readVoc(v1,voc1); - readVoc(v2,voc2); - string line1,line2,line3; - vector > vsi(voc1.size()+1000); - int nLine=0; - int totalElems=0; - while(getline(t,line1)&&getline(t,line2)&&getline(t,line3)) - { - istringstream eingabe1(line1),eingabe2(line2),eingabe3(line3); - double count; - string word; - eingabe1>>count; - vectorl1,l2; - while(eingabe2>>word) - l1.push_back(atoi(word.c_str())); - while(eingabe3>>word) - l2.push_back(atoi(word.c_str())); - if( ((++nLine)%1000)==0 ) - cerr << "line " << nLine << '\n'; - totalElems-=vsi[0].size(); - for(unsigned int j=0;j=int(vsi.size()) ) - { - cerr << "I have to resize: " << l1[i] << endl; - vsi.resize(l1[i]+1); - } - map&theset=vsi[l1[i]]; - 
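      // [Editor's note] theset maps each cooccurring target-word id to a
      // count for source id l1[i]; totalElems tracks the total number of
      // stored pairs so that, in -counts mode (maxElems != 0), the partially
      // filled table can be printed and cleared once it grows past maxElems
      // (see below), bounding memory use.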
totalElems-=theset.size(); - for(unsigned int j=0;jmaxElems&&maxElems ) - { - cerr << "INFO: print out " << totalElems << " entries.\n"; - for(unsigned int i=0;i::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j) - { - if(counts==1 ) - cout << j->second << " " << i << " " << j->first << '\n'; - else - cout << i << " " << j->first << '\n'; - } - totalElems=0; - vsi.clear(); - vsi.resize(voc1.size()+1000); - } - } - cerr << "END.\n"; - for(unsigned int i=0;i::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j) - { - if(counts==1 ) - cout << j->second << " " << i << " " << j->first << '\n'; - else - cout << i << " " << j->first << '\n'; - } -} - diff --git a/ext/giza-pp/GIZA++-v2/snt2plain.cpp b/ext/giza-pp/GIZA++-v2/snt2plain.cpp deleted file mode 100644 index 3eb99ad0..00000000 --- a/ext/giza-pp/GIZA++-v2/snt2plain.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -void readVoc(istream&in,map&voc) -{ - string line,s1,s2; - voc["1"]="UNK"; - if( !in )cerr <<"Vocabulary does not exist.\n"; - while(getline(in,line)) - { - istringstream eingabe(line); - if( !(eingabe>>s1>>s2)) - cerr << "ERROR in vocabulary '" << line << "'\n"; - voc[s1]=s2; - } -} - -int main(int argc,char **argv) -{ - if( argc!=5&&argc!=6 ) - { - cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 output_prefix [ -counts ]\n"; - cerr << "Converts GIZA++ snt-format into plain text.\n"; - exit(1); - } - bool counts=0; - if( argc==6 ) - { - if(string(argv[5])!="-counts") - cerr << "ERROR: wrong option " << argv[5] << endl; - counts=1; - } - ifstream v1(argv[1]),v2(argv[2]),t(argv[3]); - string prefix(argv[4]); - string outfil1=prefix+"1.txt"; - string outfil2=prefix+"2.txt"; - ofstream out1(outfil1.c_str()); - ofstream out2(outfil2.c_str()); - mapvoc1,voc2; - readVoc(v1,voc1); - readVoc(v2,voc2); - int source=0,target=0; - string line1,line2,line3; - int printed=0; - while(getline(t,line1)&&getline(t,line2)&&getline(t,line3)) - { - istringstream eingabe1(line1),eingabe2(line2),eingabe3(line3); - double count; - string word; - eingabe1>>count; - vectorl1,l2; - while(eingabe2>>word) - l1.push_back(word); - while(eingabe3>>word) - l2.push_back(word); - if( counts ) - cout << count << '\n'; - for(unsigned int p=0;p& mkcls1.log - rm PLAIN1.txt - mkcls -m2 -pPLAIN2.txt -c50 -V$2.classes opt >& mkcls2.log - rm PLAIN2.txt - GIZA++ -S $1 -T $2 -C $3 -p0 0.98 -o GIZA++ >& GIZA++.log - -endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_model1.h b/ext/giza-pp/GIZA++-v2/transpair_model1.h deleted file mode 100644 index dd1425dc..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model1.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef transpair_model1_h_fjo_defined -#define transpair_model1_h_fjo_defined -//#include "logprob.h" -#include "defs.h" -#include "Array2.h" -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include -#include -#include "Array2.h" -#include "mystl.h" - -class transpair_model1 -{ - public: - bool verboseTP; - Array2 > t; - WordIndex l, m; - Vector E,F; - void setMode(bool) - {} - transpair_model1(const Vector&es, const Vector&fs, const tmodel&tTable) - : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs) - { - WordIndex l=es.size()-1,m=fs.size()-1; - for(WordIndex i=0;i<=l;i++) - for(WordIndex j=1;j<=m;j++) - { - t(i, j)=tTable.getProb(es[i], fs[j]); - if( !(t(i,j)>=PROB_SMOOTH) ) - cerr << "ERROR IN PROBABILITY: " << t(i,j) << " " << PROB_SMOOTH << endl; - } - } - /* transpair_model1(const Vector&es, const Vector&fs) - : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs) - { - WordIndex l=es.size()-1,m=fs.size()-1; - for(WordIndex i=0;i<=l;i++) - for(WordIndex j=1;j<=m;j++) - { - const string&estr=globeTrainVcbList->getVocabList()[es[i]].word; - const string&fstr=globfTrainVcbList->getVocabList()[fs[j]].word; - if( lev(estr,fstr)==0 ) - t(i,j)=1.0; - else - t(i,j)=1/100.0; - massert( t(i,j)>=PROB_SMOOTH ); - } -}*/ - WordIndex get_l()const - {return l;} - WordIndex get_m()const - {return m;} - const PROB&get_t(WordIndex i, WordIndex j)const - {massert( t(i,j)>=PROB_SMOOTH); - return t(i, j);} - WordIndex get_es(int i)const {return E[i];} - WordIndex get_fs(int j)const {return F[j];} - bool greedyHillClimbing()const - {return 0;} - void computeScores(const alignment&,vector&)const - {} - LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const - { - int old_i=a(j); - return (t(new_i, j) /t(old_i, j)); - } - LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const - { - WordIndex i1=a(j1), i2=a(j2); - return (t(i2, j1)/t(i1, j1))*(t(i1, j2)/t(i2, j2)); - } - LogProb prob_of_target_and_alignment_given_source(const alignment&al)const - { - LogProb prob=1.0; - int lp1=al.get_l()+1; - for(unsigned int j=1;j<=al.get_m();++j) - prob*=t(al(j),j)/lp1; - return prob; - } -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_model2.h b/ext/giza-pp/GIZA++-v2/transpair_model2.h deleted file mode 100644 index 751ce528..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model2.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef transpair_model2_defined_h -#define transpair_model2_defined_h - -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include -#include "transpair_model1.h" - - -class transpair_model2 : public transpair_model1 -{ - protected: - Array2 > a; - public: - transpair_model2(const Vector&es, const Vector&fs, const tmodel&tTable, - const amodel&aTable) - : transpair_model1(es,fs,tTable),a(es.size(),fs.size()) - { - for(WordIndex i=0;i<=l;i++) - for(WordIndex j=1;j<=m;j++) - a(i, j)=aTable.getValue(i, j, l, m); - } - const PROB&get_a(WordIndex i, WordIndex j)const - {return a(i, j);} -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_model3.cpp b/ext/giza-pp/GIZA++-v2/transpair_model3.cpp deleted file mode 100644 index 0ab4c547..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model3.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/*-- -transpair_model3: representation of a translation pair for model3 training -allowing for fast access (esp. to t table). 
- -Franz Josef Och (30/07/99) ---*/ -#include "transpair_model3.h" -#include - -transpair_model3::transpair_model3(const Vector&es, const Vector&fs, tmodel&tTable, amodel&aTable, amodel&dTable, nmodel&nTable, double _p1, double _p0, void*) - : transpair_model2(es,fs,tTable,aTable),d(es.size(), fs.size()),n(es.size(), MAX_FERTILITY+1), p0(_p0), p1(_p1) -{ - WordIndex l=es.size()-1,m=fs.size()-1; - for(WordIndex i=0;i<=l;i++) - { - for(WordIndex j=1;j<=m;j++) - d(i, j)=dTable.getValue(j, i, l, m); - if( i>0 ) - { - for(WordIndex f=0;f0 ) - out << "(fert:"< " << total << '\n'; - } - for (WordIndex j = 1 ; j <= m ; j++) - { - total*= get_t(al(j), j) ; - massert( get_t(al(j), j)>=PROB_SMOOTH ); - if( verb) cerr << "IBM-3: t of " << j << " " << al(j) << ": " << get_t(al(j), j) << " -> " << total << '\n'; - if (al(j)) - { - total *= get_d(al(j), j); - if( verb) cerr << "IBM-3: d of " << j << ": " << get_d(al(j), j) << " -> " << total << '\n'; - } - } - return total?total:zero; -} - - -void transpair_model3::computeScores(const alignment&al,vector&d)const -{ - LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ; - total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); - for (WordIndex i = 1 ; i <= al.fert(0) ; i++) - total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ; - for (WordIndex i = 1 ; i <= l ; i++) - { - total2 *= get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i)); - } - for (WordIndex j = 1 ; j <= m ; j++) - { - total3*= get_t(al(j), j) ; - massert( get_t(al(j), j)>=PROB_SMOOTH ); - if (al(j)) - { - total4 *= get_d(al(j), j); - } - } - d.push_back(total1);//5 - d.push_back(total2);//6 - d.push_back(total3);//7 - d.push_back(total4);//8 -} diff --git a/ext/giza-pp/GIZA++-v2/transpair_model3.h b/ext/giza-pp/GIZA++-v2/transpair_model3.h deleted file mode 100644 index 9c07fd91..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model3.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -/*-- -transpair_model3: representation of a translation pair for model3 training -allowing for fast access (esp. to t table). 
- -Franz Josef Och (30/07/99) ---*/ -#ifndef transpair_model3_h_fjo_defined -#define transpair_model3_h_fjo_defined -#include "Array2.h" -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include -#include "transpair_model2.h" - -extern double factorial(int n); -inline bool doubleEqual(const double a, const double b) -{ - if( a==b ) - return 1.0; - bool bl=fabs(1.0-a/b)<1e-10; - if( bl ) - return 1; - else - { - cerr << "DIFFERENT: " << a << " " << b << " " << a/b << " " << 1.0-a/b << endl; - return 0; - } -} - - -class transpair_model3 : public transpair_model2 -{ - protected: - Array2 > d, n; - PROB p0, p1; - public: - typedef transpair_model3 simpler_transpair_model; - transpair_model3(const Vector&es, const Vector&fs, tmodel&tTable, - amodel&aTable, amodel&dTable, nmodel&nTable, - double _p1, double _p0, void*x=0); - const PROB&get_d(WordIndex i, WordIndex j)const - {return d(i, j);} - const PROB&get_a(WordIndex i, WordIndex j)const - {return a(i, j);} - const PROB&get_fertility(WordIndex i, WordIndex f)const - {massert(i>0);return (f>=MAX_FERTILITY)?n(i, MAX_FERTILITY):n(i, f);} - int modelnr()const{return 3;} - LogProb scoreOfAlignmentForChange(const alignment&)const - {return -1.0; } - LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j, double thisValue=-1.0,bool withDistortions=1)const; - LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2, double thisValue=-1.0,bool withDistortions=1)const ; - LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const; - LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const; - friend ostream&operator<<(ostream&out, const transpair_model3&m); - LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verb=0)const; - bool isSubOptimal()const{return 1;} - void computeScores(const alignment&al,vector&d)const; -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_model4.cpp b/ext/giza-pp/GIZA++-v2/transpair_model4.cpp deleted file mode 100644 index ebc2666a..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model4.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#include "transpair_model4.h" -#include "Parameter.h" - -GLOBAL_PARAMETER(float,d4modelsmooth_factor,"model4SmoothFactor","smooting parameter for alignment probabilities in Model 4",PARLEV_SMOOTH,0.2); - -LogProb transpair_model4::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const -{ - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - alignment b(a); - b.set(j, new_i); - LogProb b_prob=prob_of_target_and_alignment_given_source(b); - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; -} -LogProb transpair_model4::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double)const -{ - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - alignment b(a); - b.set(j1, a(j2)); - b.set(j2, a(j1)); - LogProb b_prob=prob_of_target_and_alignment_given_source(b); - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; -} -//increasing efficiency: no copy of alignment (calc. everything incrementally) -LogProb transpair_model4::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const -{ - if( a(j)==new_i ) - return 1.0; - LogProb change=transpair_model3::scoreOfMove(a,new_i,j,-1.0,0); - LogProb a_prob=thisValue; - if(a_prob<0.0 ) - a_prob=prob_of_target_and_alignment_given_source(a,2); - massert(a_prob==prob_of_target_and_alignment_given_source(a,2)); - WordIndex old_i=a(j); - //alignment b(a); - const_cast(a).set(j,new_i); - LogProb b_prob=prob_of_target_and_alignment_given_source(a,2); - const_cast(a).set(j,old_i); - change*=b_prob/a_prob; - return change; -} -//increasing efficiency: no copy of alignment (calc. everything incrementally) -LogProb transpair_model4::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const -{ - WordIndex aj1=a(j1),aj2=a(j2); - if( aj1==aj2 ) - return 1.0; - LogProb change=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0); - LogProb a_prob=thisValue; - if( a_prob<0.0 ) - a_prob=prob_of_target_and_alignment_given_source(a,2); - massert(a_prob==prob_of_target_and_alignment_given_source(a,2)); - - //alignment b(a); - const_cast(a).set(j1,aj2); - const_cast(a).set(j2,aj1); - LogProb b_prob=prob_of_target_and_alignment_given_source(a,2); - const_cast(a).set(j1,aj1); - const_cast(a).set(j2,aj2); - - if( verboseTP ) - cerr << "scoreOfSwap: " << change << ' ' << a_prob << ' ' << b_prob << ' ' << endl; - change*=b_prob/a_prob; - if( verboseTP ) - cerr << "resulting: " << change << " should be " << _scoreOfSwap(a,j1,j2) << endl; - return change; -} - -LogProb transpair_model4::prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const -{ - LogProb total = 1.0 ; - total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); - if( verb) cerr << "IBM-4: (1-p1)^(m-2 f0)*p1^f0: " << total << endl; - for (WordIndex i = 1 ; i <= al.fert(0) ; i++) - total *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ; - if( verb) cerr << "IBM-4: +NULL:binomial+distortion " << total << endl; - for (WordIndex i = 1 ; i <= l ; i++) - { - total *= get_fertility(i, al.fert(i));// * (LogProb) factorial(al.fert(i)); - if( verb) cerr << "IBM-4: fertility of " << i << " " << get_fertility(i, al.fert(i)) << " -> " << total << endl; - } - for (WordIndex j = 1 ; j <= m ; j++) - { - total*= get_t(al(j), j) ; - if( verb) cerr << "IBM-4: t of j:" << j << " i:" << al(j) << ": " << get_t(al(j), j) << " -> " << total 
<< endl; - } - return total; -} - -LogProb transpair_model4::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const -{ - LogProb total = 1.0 ; - static const LogProb almostZero = 1E-299 ; - if( distortionType&1 ) - { - total *= prob_of_target_and_alignment_given_source_1(al,verb); - } - if( distortionType&2 ) - { - for(WordIndex j=1;j<=m;j++) - if( al(j) ) - if( al.get_head(al(j))==j) - { - int ep=al.prev_cept(al(j)); - float x2=probFirst[ep](j,al.get_center(ep)); - massert(x2<=1.0); - total*=x2; - if( verb) cerr << "IBM-4: d=1 of " << j << ": " << x2 << " -> " << total << endl; - } - else - { - float x2=probSecond(j,al.prev_in_cept(j)); - massert(x2<=1.0); - total*=x2; - if( verb) cerr << "IBM-4: d>1 of " << j << ": " << x2 << " -> " << total << endl; - } - } - return total?total:almostZero; -} - -void transpair_model4::computeScores(const alignment&al,vector&d)const -{ - LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ; - total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); - for (WordIndex i = 1 ; i <= al.fert(0) ; i++) - total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ; - for (WordIndex i = 1 ; i <= l ; i++) - total2 *= get_fertility(i, al.fert(i));// * (LogProb) factorial(al.fert(i)); - for (WordIndex j = 1 ; j <= m ; j++) - total3*= get_t(al(j), j) ; - for(WordIndex j=1;j<=m;j++) - if( al(j) ) - if( al.get_head(al(j))==j) - { - int ep=al.prev_cept(al(j)); - float x2=probFirst[ep](j,al.get_center(ep)); - total4*=x2; - } - else - { - float x2=probSecond(j,al.prev_in_cept(j)); - total4*=x2; - } - d.push_back(total1);//9 - d.push_back(total2);//10 - d.push_back(total3);//11 - d.push_back(total4);//12 -} diff --git a/ext/giza-pp/GIZA++-v2/transpair_model4.h b/ext/giza-pp/GIZA++-v2/transpair_model4.h deleted file mode 100644 index 730fbe7f..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model4.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef transpair_model4_h_fjo_defined -#define transpair_model4_h_fjo_defined -#include "Array2.h" -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include "D4Tables.h" -#include "transpair_model3.h" - -extern double factorial(int n); - -class transpair_model4 : public transpair_model3 -{ - private: - d4model&d4m; - Array2 probSecond; - Vector > probFirst; - public: - typedef transpair_model3 simpler_transpair_model; - transpair_model4(const Vector&es, const Vector&fs, tmodel&tTable, amodel&aTable, amodel&dTable, nmodel&nTable, double _p1, double _p0,d4model*_d4m) - : transpair_model3(es, fs, tTable, aTable, dTable, nTable, _p1, _p0), - d4m(*_d4m),probSecond(m+1,m+1,0.0),probFirst(l+1) - { - for(unsigned int j1=1;j1<=m;++j1) - for(unsigned int j2=1;j2 &pf=probFirst[i]=Array2(m+1,m+1,0.0); - for(unsigned int j1=1;j1<=m;++j1) - { - map::const_iterator ci=d4m.getProb_first_iterator(d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m); - for(unsigned int j2=0;j2<=m;++j2) - { - pf(j1,j2)=d4m.getProb_first_withiterator(j1,j2,m,ci); - massert(pf(j1,j2)==d4m.getProb_first(j1,j2,d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m)); - } - } - } - } - LogProb prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const; - LogProb scoreOfAlignmentForChange(const alignment&a)const - {return prob_of_target_and_alignment_given_source(a,2); } - LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const; - LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ; - LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const; - LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ; - int modelnr()const{return 4;} - LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const; - void computeScores(const alignment&al,vector&d)const; -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_model5.cpp b/ext/giza-pp/GIZA++-v2/transpair_model5.cpp deleted file mode 100644 index 7baa5ca7..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model5.cpp +++ /dev/null @@ -1,243 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#include "transpair_model5.h" -#include "Parameter.h" - -int m5scorefound=0,m5scorenotfound=0; - -GLOBAL_PARAMETER(float,d5modelsmooth_factor,"model5SmoothFactor","smooting parameter for distortion probabilities in Model 5 (linear interpolation with constant)",PARLEV_SMOOTH,0.1); -float d5modelsmooth_countoffset=0.0; - -LogProb transpair_model5::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const -{ - if( doModel4Scoring ) - return transpair_model4::_scoreOfMove(a,new_i,j); - alignment b(a); - b.set(j, new_i); - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - LogProb b_prob=prob_of_target_and_alignment_given_source(b); - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; -} -LogProb transpair_model5::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const -{ - if( doModel4Scoring ) - return transpair_model4::_scoreOfSwap(a,j1,j2,thisValue); - alignment b(a); - b.set(j1, a(j2)); - b.set(j2, a(j1)); - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - LogProb b_prob=prob_of_target_and_alignment_given_source(b); - assert(a_prob); - assert(b_prob); - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; -} - -//increasing efficiency: no copy of alignment (calc. everything incrementally) -LogProb transpair_model5::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const -{ - if( doModel4Scoring ) - return transpair_model4::scoreOfMove(a,new_i,j,thisValue); - alignment b(a); - b.set(j,new_i); - - LogProb change; - const WordIndex old_i=a(j); - WordIndex f0=a.fert(0); - if (old_i == new_i) - change=1.0; - else if (old_i == 0) - change=((double)p0*p0/p1) * - ((f0*(m-f0+1.0)) / ((m-2*f0+1)*(m-2*f0+2.0))) * - ((PROB)(1.0)) * - (get_fertility(new_i, a.fert(new_i)+1) / get_fertility(new_i, a.fert(new_i)))* - (t(new_i, j)/t(old_i, j))* - 1.0; - else if (new_i == 0) - change=(double(p1) / (p0*p0)) * - (double((m-2*f0)*(m-2*f0-1))/((1+f0)*(m-f0))) * - (1.0) * - (get_fertility(old_i, a.fert(old_i)-1) /get_fertility(old_i, a.fert(old_i)))* - (t(new_i, j) /t(old_i, j)) * - (1.0); - else - change=(1.0) * - (get_fertility(old_i,a.fert(old_i)-1) / get_fertility(old_i,a.fert(old_i))) * - (get_fertility(new_i,a.fert(new_i)+1) /get_fertility(new_i,a.fert(new_i))) * - (t(new_i,j)/t(old_i,j)) * - (1.0); - LogProb a_prob=thisValue; - if( a_prob<0.0 ) - a_prob=prob_of_target_and_alignment_given_source(a,2); - massert(a_prob==prob_of_target_and_alignment_given_source(a,2)); - - LogProb b_prob=prob_of_target_and_alignment_given_source(b,2); - change*=b_prob/a_prob; - return change; -} -LogProb transpair_model5::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const -{ - if( doModel4Scoring ) - return transpair_model4::scoreOfSwap(a,j1,j2,thisValue); - alignment b(a); - b.set(j1,a(j2)); - b.set(j2,a(j1)); - LogProb change=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0); - LogProb a_prob=thisValue; - if( a_prob<0.0 ) - a_prob=prob_of_target_and_alignment_given_source(a,2); - massert(a_prob==prob_of_target_and_alignment_given_source(a,2)); - LogProb b_prob=prob_of_target_and_alignment_given_source(b,2); - change*=b_prob/a_prob; - return change; -} - -LogProb transpair_model5::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const -{ - if( doModel4Scoring ) - return transpair_model4::prob_of_target_and_alignment_given_source(al,distortionType); - LogProb total = 1.0 ; - 
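  // [Editor's note] Depending on which bits of distortionType are set, the
  // product accumulated below combines (bit 1) the NULL-generation term
  // (1-p1)^(m-2*f0) * p1^f0 with its binomial coefficient, the fertility
  // probabilities n(phi_i | e_i) and the lexicon probabilities t(f_j | e_a(j)),
  // and (bit 2) the vacancy-based Model-5 distortion terms. The NULL
  // placement factor divides by i rather than by a deficiency correction:
  // IBM-5 is not deficient.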
static const LogProb almostZero = 1E-299 ; - double x2; - if( distortionType&1 ) - { - total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); - if( verb) cerr << "IBM-5: (1-p1)^(m-2 f0)*p1^f0: " << total << endl; - for (WordIndex i = 1 ; i <= al.fert(0) ; i++) - total *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient! - if( verb) cerr << "IBM-5: +NULL:binomial+distortion " << total << endl; - for (WordIndex i = 1 ; i <= l ; i++) - { - total *= get_fertility(i, al.fert(i)); - if( verb) cerr << "IBM-5: fertility of " << i << " " << get_fertility(i, al.fert(i)) << " -> " << total << endl; - } - for (WordIndex j = 1 ; j <= m ; j++) - { - total*= get_t(al(j), j) ; - if( verb) cerr << "IBM-5: t of j:" << j << " i:" << al(j) << ": " << get_t(al(j), j) << " -> " << total << endl; - } - } - if( distortionType&2 ) - { - PositionIndex prev_cept=0; - PositionIndex vac_all=m; - Vector vac(m+1,0); - for(WordIndex i=1;i<=l;i++) - { - PositionIndex cur_j=al.als_i[i]; - PositionIndex prev_j=0; - PositionIndex k=0; - if(cur_j) { // process first word of cept - k++; - // previous position - total*= (x2=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k)); - - vac_all--; - assert(vac[cur_j]==0); - vac[cur_j]=1; - - if( verb) cerr << "IBM-5: d=1 of " << cur_j << ": " << x2 << " -> " << total << endl; - prev_j=cur_j; - cur_j=al.als_j[cur_j].next; - } - while(cur_j) { // process following words of cept - k++; - // previous position - int vprev=vacancies(vac,prev_j); - total*= (x2=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-al.fert(i)+k)); - - - vac_all--; - vac[cur_j]=1; - - - if( verb) cerr << "IBM-5: d>1 of " << cur_j << ": " << x2 << " -> " << total << endl; - prev_j=cur_j; - cur_j=al.als_j[cur_j].next; - } - assert(k==al.fert(i)); - if( k ) - prev_cept=i; - } - assert(vac_all==al.fert(0)); - } - total = total?total:almostZero; - return total; -} - - -void transpair_model5::computeScores(const alignment&al,vector&d)const -{ - LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ; - total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0))); - for (WordIndex i = 1 ; i <= al.fert(0) ; i++) - total1 *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient! 
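  // [Editor's note] The remaining factors computed below mirror the
  // decomposition used in prob_of_target_and_alignment_given_source:
  // fertility (total2), lexicon (total3) and vacancy-based distortion
  // (total4); together with the NULL/binomial term (total1) they are pushed
  // onto d as score components 13-16.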
- for (WordIndex i = 1 ; i <= l ; i++) - total2 *= get_fertility(i, al.fert(i)); - for (WordIndex j = 1 ; j <= m ; j++) - total3*= get_t(al(j), j) ; - PositionIndex prev_cept=0; - PositionIndex vac_all=m; - Vector vac(m+1,0); - for(WordIndex i=1;i<=l;i++) - { - PositionIndex cur_j=al.als_i[i]; - PositionIndex prev_j=0; - PositionIndex k=0; - if(cur_j) { // process first word of cept - k++; - total4*=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k); - vac_all--; - assert(vac[cur_j]==0); - vac[cur_j]=1; - prev_j=cur_j; - cur_j=al.als_j[cur_j].next; - } - while(cur_j) { // process following words of cept - k++; - int vprev=vacancies(vac,prev_j); - total4*=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-al.fert(i)+k); - vac_all--; - vac[cur_j]=1; - prev_j=cur_j; - cur_j=al.als_j[cur_j].next; - } - assert(k==al.fert(i)); - if( k ) - prev_cept=i; - } - assert(vac_all==al.fert(0)); - d.push_back(total1);//13 - d.push_back(total2);//14 - d.push_back(total3);//15 - d.push_back(total4);//16 -} diff --git a/ext/giza-pp/GIZA++-v2/transpair_model5.h b/ext/giza-pp/GIZA++-v2/transpair_model5.h deleted file mode 100644 index 5ecf49dd..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_model5.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef transpair_model5_h_fjo_defined -#define transpair_model5_h_fjo_defined -#include "Array2.h" -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include "D5Tables.h" -#include "transpair_model4.h" - -extern double factorial(int n); - -inline int vacancies(const Vector&vac,int u) -{ - int n=0; - const char *i=&(vac[0])+1; - const char *end=&(vac[0])+u+1; - while(i,LogProb> scores[4]; - transpair_model5(const Vector&es, const Vector&fs, tmodel&tTable, - amodel&aTable, amodel&dTable, nmodel&nTable, double _p1, double _p0, - const d5model*_d5m) - : transpair_model4(es, fs, tTable, aTable, dTable, nTable, _p1, _p0,&_d5m->d4m),d5m(*_d5m),doModel4Scoring(0) {} - LogProb scoreOfAlignmentForChange(const alignment&a)const - { - if( doModel4Scoring ) - return transpair_model4::prob_of_target_and_alignment_given_source(a,2); - else - return prob_of_target_and_alignment_given_source(a,2); - } - LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const; - LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ; - LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const; - LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ; - int modelnr()const{return 5;} - LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const; - void computeScores(const alignment&al,vector&d)const; -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h b/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h deleted file mode 100644 index d836ad4f..00000000 --- a/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - -Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI) - -This file is part of GIZA++ ( extension of GIZA ). - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef transpair_modelhmm_h_fjo_defined -#define transpair_modelhmm_h_fjo_defined -#include "Array2.h" -#include "defs.h" -#include "Vector.h" -#include "NTables.h" -#include "ATables.h" -#include "TTables.h" -#include "alignment.h" -#include -#include "transpair_model2.h" -#include "ForwardBackward.h" -#include "hmm.h" - -class transpair_modelhmm : public transpair_model2 -{ - public: - typedef transpair_modelhmm simpler_transpair_model; - HMMNetwork*net; - transpair_modelhmm(const Vector&es, const Vector&fs, const tmodel&tTable, - const amodel&aTable,const amodel&,const nmodel&, - double, double,const hmm*h) - : transpair_model2(es,fs,tTable,aTable),net(h->makeHMMNetwork(es,fs,0)) - {} - ~transpair_modelhmm() { delete net; } - int modelnr()const{return 6;} - LogProb scoreOfMove(const alignment&a, WordIndex _new_i, WordIndex j,double=-1.0)const - { - int new_i=_new_i; - LogProb change=1.0; - int old_i=a(j); - if (old_i == new_i) - change=1.0; - else - { - int theJ=j-1; - old_i--; - new_i--; - int jj=j-1; - while(jj>0&&a(jj)==0) - jj--; - int theIPrev= (jj>0)?(a(jj)-1):0; - if( j>1&&a(j-1)==0 ) - theIPrev+=l; - if( old_i==-1 ){old_i = theIPrev;if(old_igetAlphainit(new_i)/net->getAlphainit(old_i); - } - do - { - if( new_i!=old_i ) - { - change*=net->nodeProb(new_i,theJ)/net->nodeProb(old_i,theJ); - } - if( theJ>0) - change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,old_i); - theIPrevOld=old_i; - theIPrevNew=new_i; - theJ++; - if( theJgetBetainit(new_i)/net->getBetainit(old_i); - } - else - { - new_i=a(theJ+1)-1; - if( new_i==-1) - new_i=theIPrevNew; - change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,new_i); - } - } - return change; - } - LogProb scoreOfAlignmentForChange(const alignment&)const - {return -1.0; } - LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const - { - return _scoreOfSwap(a,j1,j2); - } - LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const - { - alignment b(a); - b.set(j, new_i); - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - LogProb b_prob=prob_of_target_and_alignment_given_source(b); - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; - } - LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const - { - WordIndex aj1=a(j1),aj2=a(j2); - if( aj1==aj2 ) - return 1.0; - LogProb a_prob=prob_of_target_and_alignment_given_source(a); - - /*alignment b(a); - b.set(j1, a(j2)); - b.set(j2, a(j1)); - LogProb b_prob=prob_of_target_and_alignment_given_source(b);*/ - - const_cast(a).set(j1,aj2); - const_cast(a).set(j2,aj1); - LogProb b_prob=prob_of_target_and_alignment_given_source(a); - const_cast(a).set(j1,aj1); - const_cast(a).set(j2,aj2); - - if( a_prob ) - return b_prob/a_prob; - else if( b_prob ) - return 1e20; - else - return 1.0; - } - inline friend ostream&operator<<(ostream&out, const transpair_modelhmm&) - { - return out << "NO-OUTPUT for transpair_modelhmm\n"; - } - LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verbose=0)const - { - double prob=1.0; - int theIPrev=0; - for(unsigned int j=1;j<=m;j++) - { - int theJ=j-1; - int theI=al(j)-1; - if( theI==-1 ) - theI=(theIPrev%l)+l; - prob*=net->nodeProb(theI,theJ); - if( verbose ) - cout << "NP " << net->nodeProb(theI,theJ) << ' '; - if( j==1 ) - { - prob*=net->getAlphainit(theI); - if( verbose ) - cout << "AP0 " << net->getAlphainit(theI) << ' '; - } - else - { - 
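          // [Editor's note] For j > 1 the usual HMM factorization applies:
          // the transition term outProb(j-1, i_prev, i) enters here on top
          // of the emission term nodeProb multiplied in above, i.e.
          // P(f, a | e) = prod_j p(a_j | a_{j-1}, l) * p(f_j | e_{a_j}).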
prob*=net->outProb(theJ,theIPrev,theI); - if( verbose ) - cout << "AP1 " << net->outProb(theJ,theIPrev,theI) << ' '; - } - theIPrev=theI; - if( j==m ) - { - prob*=net->getBetainit(theI); - if( verbose ) - cout << "AP2 " << net->getBetainit(theI) << ' '; - } - if( verbose ) - cout << "j:"<finalMultiply; - } - void computeScores(const alignment&al,vector&d)const - { - double prob1=1.0,prob2=1.0; - int theIPrev=0; - for(unsigned int j=1;j<=m;j++) - { - int theJ=j-1; - int theI=al(j)-1; - if( theI==-1 ) - theI=(theIPrev%l)+l; - prob1*=net->nodeProb(theI,theJ); - if( j==1 ) - { - prob2*=net->getAlphainit(theI); - } - else - { - prob2*=net->outProb(theJ,theIPrev,theI); - } - theIPrev=theI; - if( j==m ) - { - prob2*=net->getBetainit(theI); - } - } - d.push_back(prob1); - d.push_back(prob2); - } - - bool isSubOptimal()const{return 0;} -}; -#endif diff --git a/ext/giza-pp/GIZA++-v2/utility.cpp b/ext/giza-pp/GIZA++-v2/utility.cpp deleted file mode 100644 index 4e9607ae..00000000 --- a/ext/giza-pp/GIZA++-v2/utility.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "mymath.h" - -double factorial(int n) -{ - double f=1; - for(int i=2; i <= n; i++) - f *= i; - return f; -} diff --git a/ext/giza-pp/GIZA++-v2/utility.h b/ext/giza-pp/GIZA++-v2/utility.h deleted file mode 100644 index 078a2a09..00000000 --- a/ext/giza-pp/GIZA++-v2/utility.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef utility_h -#define utility_h -#include <iostream> -#include "Perplexity.h" -#include "Vector.h" -#include "TTables.h" -#include "getSentence.h" -#include "vocab.h" - -extern void printHelp(void); -extern void parseConfigFile (char * fname ); -extern void parseArguments(int argc, char *argv[]); -extern void generatePerplexityReport(const Perplexity& trainperp, - const Perplexity& testperp, - const Perplexity& trainVperp, - const Perplexity& testVperp, - ostream& of, int trainsize, - int testsize, unsigned int last, bool); - -extern void printSentencePair(Vector<WordIndex>& es, Vector<WordIndex>& fs, ostream& of); - -extern void printOverlapReport(const tmodel<COUNT, PROB>& tTable, - sentenceHandler& testHandler, vcbList& trainEList, - vcbList& trainFList, vcbList& testEList, vcbList& testFList); - -extern void printAlignToFile(const Vector<WordIndex>& es, const Vector<WordIndex>& fs, - const Vector<WordEntry>& evlist, const Vector<WordEntry>& fvlist, - ostream& of2, const Vector<WordIndex>& viterbi_alignment, int pair_no, - double viterbi_score); - -extern double factorial(int) ; - -#endif diff --git a/ext/giza-pp/GIZA++-v2/vocab.cpp b/ext/giza-pp/GIZA++-v2/vocab.cpp deleted file mode 100644 index a91c5720..00000000 --- a/ext/giza-pp/GIZA++-v2/vocab.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#include "vocab.h" - -void vcbList::readVocabList() - // reads a vocabulary file from fname. 
It expects the following format: - // - // token_id token_string frequency -{ - - int freq=0; - WordIndex word_id ; - WordEntry entry("NULL",0) ; - - string line, word ; - cerr << "Reading vocabulary file from:" << fname << "\n"; - // total = 0 ; - ifstream vFile(fname); - if(!vFile){ - cerr << "\nCannot open vocabulary file " << fname << "file"; - exit(1); - } - - list.push_back(entry); - s2i[entry.word]=list.size()-1; - - while(getline(vFile, line)){ - istringstream buffer(line); - if(!(buffer >> word_id >> word >> freq)) - cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl; - if (word_id == 0){ - cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ; - exit(-1); - } - else if (word_id >= MAX_VOCAB_SIZE){ - cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size " - << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ; - exit(-1); - } - else if (freq < 0){ - cerr << "ERROR: frequency must be a positive integer, in line :\n" - << line <<"\n"; - exit(-1); - } - else if(word_id >= list.size()){ - list.resize(word_id+1); - list[word_id].word = word ; - s2i[word]=word_id; - list[word_id].freq = 0 ; - noUniqueTokens = word_id + 1 ; - // noUniqueTokens++ ; - // total += freq ; - } - else if(list[word_id].word != "\0"){ - cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n" - << line <<"\n"; - cerr << "TOKEN ID " << word_id << " has already been assigned to: " << - list[word_id].word << "\n"; - exit(-1); - } - else { // line has valid information - list[word_id].word = word ; - s2i[word]=word_id; - list[word_id].freq = 0 ; - // noUniqueTokens++ ; - noUniqueTokens = word_id + 1 ; - // total += freq ; - } - } // end of while -} - diff --git a/ext/giza-pp/GIZA++-v2/vocab.h b/ext/giza-pp/GIZA++-v2/vocab.h deleted file mode 100644 index 988edc6c..00000000 --- a/ext/giza-pp/GIZA++-v2/vocab.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - -EGYPT Toolkit for Statistical Machine Translation -Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ -#ifndef _vocab_h -#define _vocab_h 1 - -#include "defs.h" -#include "Vector.h" - -#include <fstream> -#include <sstream> -#include <map> - -class WordEntry { - public: - string word ; - double freq ; - WordEntry():word("\0"), freq(0){}; - WordEntry(string w, int f):word(w), freq(f){}; -}; - -class vcbList{ - private: - Vector<WordEntry> list ; - map<string,WordIndex> s2i; - double total; - WordIndex noUniqueTokens ; - WordIndex noUniqueTokensInCorpus ; - const char* fname ; - public: - vcbList(const char* f=0):list(), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){}; - void setName(const char*f) - { fname=f; } - vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){}; - inline WordIndex size()const {return (list.size());}; - inline WordIndex uniqTokens()const {return noUniqueTokens;}; - inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;}; - inline double totalVocab() const {return total;}; - inline Vector<WordEntry>& getVocabList() { return(list);}; - inline const Vector<WordEntry>& getVocabList()const { return(list);}; - void readVocabList(); - void incFreq(WordIndex id , double f){ - if(id < list.size()){ - if (list[id].freq == 0) - noUniqueTokensInCorpus++; - list[id].freq += f ; - total += f ; - } - }; - void clearAllFreq(){ - for (WordIndex id = 0 ; id < list.size() ; id++) - list[id].freq = 0 ; - total = 0 ; - noUniqueTokensInCorpus = 0 ; - }; - int operator()(const string&x)const - { - map<string,WordIndex>::const_iterator i=s2i.find(x); - if( i!=s2i.end() ) - return i->second; - else - { - cerr << "ERROR: no word index for '"<<x<<"'\n"; - return 0; - } - } - void printVocabList(ostream& of){ - for (WordIndex i = 1 ; i< list.size() ; i++){ - if (list[i].word != "" && list[i].freq > 0) - of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n'; - } - } - -}; - -#endif diff --git a/ext/giza-pp/Makefile b/ext/giza-pp/Makefile deleted file mode 100644 index cb781855..00000000 --- a/ext/giza-pp/Makefile +++ /dev/null @@ -1,20 +0,0 @@ - -.PHONY: gizapp mkcls-v2 install clean - -all: gizapp mkcls-v2 - -gizapp: - @echo $(JOSHUA) - $(MAKE) -C GIZA++-v2 - -mkcls-v2: - @echo $(JOSHUA) - $(MAKE) -C mkcls-v2 - -install: gizapp mkcls-v2 - @cp GIZA++-v2/GIZA++ GIZA++-v2/snt2cooc.out mkcls-v2/mkcls $(JOSHUA)/bin/ - -clean: - $(MAKE) -C GIZA++-v2 clean - $(MAKE) -C mkcls-v2 clean - @rm -f $(JOSHUA)/bin/GIZA++ $(JOSHUA)/bin/mkcls $(JOSHUA)/bin/snt2cooc.out diff --git a/ext/giza-pp/README b/ext/giza-pp/README deleted file mode 100644 index c4b4e347..00000000 --- a/ext/giza-pp/README +++ /dev/null @@ -1,8 +0,0 @@ -This package contains the GIZA++ toolkit and the mkcls tool, originally -written by F.J. Och and several other authors. - -For more information, refer to the README files and the following pages: - http://www.fjoch.com/mkcls.html - http://www.fjoch.com/GIZA++.html - - diff --git a/ext/giza-pp/mkcls-v2/Array.h b/ext/giza-pp/mkcls-v2/Array.h deleted file mode 100644 index ab1f101f..00000000 --- a/ext/giza-pp/mkcls-v2/Array.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - -#ifndef ARRAY_H_DEFINED -#define ARRAY_H_DEFINED -using namespace std; -#include "myassert.h" -#include -#include -#include -#include -#include "my.h" - -#define ARRAY_DEBUG - - -template class Array -{ - private: - T *p; - int realSize; - int maxWritten; - char a; - - void copy(T *a,const T *b,int n); - void copy(T *a,T *b,int n); - void _expand(); - - public: - Array() - : p(0),realSize(0),maxWritten(-1) ,a(1) - { -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY: " << this<<" "<<(void*)p << endl; -#endif - } - Array(const Array &x) - : p(new T[x.maxWritten+1]),realSize(x.maxWritten+1),maxWritten(x.maxWritten),a(x.a) - { - copy(p,x.p,realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< endl; -#endif - } - explicit Array(int n) - : p(new T[n]),realSize(n),maxWritten(n-1),a(0) - { -#ifdef VERY_ARRAY_DEBUG - cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - } - Array(int n,const T&_init,int _a=0) - : p(new T[n]),realSize(n),maxWritten(n-1),a(_a) - { - for(int iii=0;iii& operator=(const Array&x) - { - if( this!= &x ) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - - delete [] p; - realSize = x.maxWritten+1; - maxWritten = x.maxWritten; - a = x.a; - p = new T[realSize]; - copy(p,x.p,realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - } - return *this; - } - - Array& operator=(Array&x) - { - if( this!= &x ) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - delete [] p; - realSize = x.maxWritten+1; - maxWritten = x.maxWritten; - a = x.a; - p = new T[realSize]; - copy(p,x.p,realSize); -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - } - return *this; - } - - void allowAccess(int n) - { - while( realSize<=n ) - _expand(); - maxWritten=max(maxWritten,n); - massert( maxWritten()); - } - void init(int n,const T&_init,bool _a=0) - { -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - delete []p; - p=new T[n]; - realSize=n; - a=_a; - maxWritten=n-1; - for(int iii=0;iiimaxWritten ) - errorAccess(n); - return p[n]; - } - inline const T& operator[](int n) const - { - if(n<0 || n>maxWritten ) - errorAccess(n); - return p[n]; - } - const T&top(int n=0) const - {return (*this)[maxWritten-n];} - T&top(int n=0) - {return (*this)[maxWritten-n];} - T&push(const T&x) - { - (*this)[maxWritten+1]=x; - return top(); - } - bool writeTo(ostream&out) const - { - out << "Array "; - out << size() << " "; - out << a << endl; - for(int iv=0;iv<=maxWritten;iv++) - { - writeOb(out,(*this)[iv]); - out << endl; - } - return 1; - } - bool readFrom(istream&in) - { - string s; - if( !in ) - { - cerr << "ERROR(Array): file cannot be opened.\n"; - return 0; - } - in >> s; - if( !(s=="Array") ) - { - cerr << "ERROR(Array): Array!='"<> biggest; - in >> a; - resize(biggest); - for(int iv=0;iv bool operator==(const Array &x, const Array &y) -{ - if( &x == &y ) - return 1; - else - { - if( 
y.size()!=x.size() ) - return 0; - else - { - for(int iii=0;iii bool operator<(const Array &x, const Array &y) -{ - if( &x == &y ) - return 0; - else - { - if( y.size() void Array:: errorAccess(int n) const -{ - cerr << "ERROR: Access to array element " << n - << " (" << maxWritten << "," << realSize << "," << (void*)p << " " << a << ")\n"; - cout << "ERROR: Access to array element " << n - << " (" << maxWritten << "," << realSize << "," << (void*)p << " " << a << ")\n"; - massert(0); -#ifndef DEBUG - abort(); -#endif -} - -template ostream& operator<<(ostream&o,const Array&a) -{ - o << "Array(" << a.size() << "," << a.autoexpand() << "){ "; - for(int iii=0;iii istream& operator>>(istream&in, Array&) -{return in;} - -template int Hash(const Array&a) -{ - int n=0; - for(int iii=0;iii void Array::copy(T *aa,const T *bb,int n) -{ - for(int iii=0;iii void Array::copy(T *aa,T *bb,int n) -{ - for(int iii=0;iii void Array::_expand() -{ -#ifdef VERY_ARRAY_DEBUG - cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif - T *oldp=p; - int oldsize=realSize; - realSize=realSize*2+1; - p=new T[realSize]; - copy(p,oldp,oldsize); - delete [] oldp; -#ifdef VERY_ARRAY_DEBUG - cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << endl; -#endif -} - -template int Array::findMax() const -{ - if( size()==0 ) - return -1; - else - { - int maxPos=0; - for(int iii=1;iii int Array::findMin() const -{ - if( size()==0 ) - return -1; - else - { - int minPos=0; - for(int iii=1;iii -#include -#include - -template -bool writeOb(ostream&out,const T&f) -{ - out << f << " "; - return 1; -} - -template -bool readOb(istream&in,T&f) -{ - in >> f; - char c; - in.get(c); - massert(c==' '); - return 1; -} - -template -bool writeOb(ostream&out,const string &s,const T&f) -{ - out << s << " " << f << " "; - return 1; -} -template -bool readOb(istream&in,const string&s,T&f) -{ - string ss; - in >> ss; - if( s!=ss ) - { - cerr << "ERROR: readOb should be '" << s << "' and is '" << ss << "'" << endl; - return 0; - } - in >> f; - char c; - in.get(c); - massert(c==' '); - return 1; -} - -template class FixedArray -{ - private: - void copy(T *aa,const T *bb,int nnn) - {for(int iii=0;iii &x) - : p(new T[x.realSize]),realSize(x.realSize) {copy(p,x.p,realSize);} - explicit FixedArray(int n) - : p(new T[n]),realSize(n){} - FixedArray(int n,const T&_init) - : p(new T[n]),realSize(n){for(int z=0;z& operator=(const FixedArray&x) - { - if( this!= &x ) - { - delete [] p; - realSize = x.realSize; - p = new T[x.realSize]; - copy(p,x.p,realSize); - } - return *this; - } - void resize(int n) - { - if( n<=realSize ) - shrink(n); - else - { - T*np=new T[n]; - copy(np,p,realSize); - delete []p; - p=np; - realSize=n; - } - } - void shrink(int n) - { - assert(n<=realSize); - realSize=n; - } - void init(int n,const T&_init) - { - delete []p; - p=new T[n]; - realSize=n; - for(int l=0;l> s; - if( !(s=="FixedArray") ) - { - cerr << "ERROR(FixedArray): FixedArray!='"<> biggest; - resize(biggest); - for(int a=0;a()); - } - int binary_locate(const T&t) - { - T*ppos=std::lower_bound(p,p+size(),t); - int pos=ppos-p; - if( pos>=-1&&pos=0&&pos bool operator<(const FixedArray &x, const FixedArray &y) -{ - return lexicographical_compare(x.begin(),x.end(),y.begin(),y.end()); - -} - - -template bool operator==(const FixedArray &x, const FixedArray &y) -{ - if( &x == &y )return 1; - const int s = x.size(); - if( s !=y.size() )return 0; - for(int iii=0;iii int Hash(const FixedArray&a) -{ - int 
n=0; - const int s=a.size(); - for(int iii=0;iii void FixedArray:: errorAccess(int n) const -{ - massert(0); - cerr << "ERROR: Access to array element " << n - << " (" << realSize << "," << (void*)p << ")\n"; -} - -template ostream& operator<<(ostream&o,const FixedArray&a) -{ - o << "FixedArray(" << a.size() << "){ "; - for(int iii=0;iii istream& operator>>(istream&in, FixedArray&) -{ return in;} - -template FixedArray operator+(const FixedArray&a,const FixedArray&b) -{ - massert(a.size()==b.size()); - FixedArray x(a.size()); - for(int iii=0;iii FixedArray operator|(const FixedArray&aaa,const FixedArray&bbb) -{ - iassert(aaa.size()==bbb.size()); - - FixedArray xxx(aaa.size()); - for(int iii=0;iii -class FlexArray -{ -private: - FixedArray p; - int start,end; -public: - FlexArray(int _start=0,int _end=-1) - : p(_end-_start+1),start(_start),end(_end) {} - T&operator[](int i) - {return p[i-start];} - const T&operator[](int i)const - {returnp[i-start];} - int low()const{return start;} - int high()const{return end;} -}; - - -#endif diff --git a/ext/giza-pp/mkcls-v2/GDAOptimization.cpp b/ext/giza-pp/mkcls-v2/GDAOptimization.cpp deleted file mode 100644 index a9e2fa71..00000000 --- a/ext/giza-pp/mkcls-v2/GDAOptimization.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - - -#include "GDAOptimization.h" -#include "ProblemTest.h" -#include - -#define GDAOptimization GDAOptimization -#define IterOptimization IterOptimization - - - -double GDAOptimization::defaultTemperatur=1e100; - - -double GDAOptimization::defaultAlpha=0.001; - - - -GDAOptimization::GDAOptimization(Problem &p,int m) -: IterOptimization(p,m) ,temperatur(defaultTemperatur),alpha(defaultAlpha) -{ -} - - -GDAOptimization::GDAOptimization(Problem &p,double t,double a,int m) -: IterOptimization(p,m) ,temperatur(t) ,alpha(a) -{ -} - - -GDAOptimization::GDAOptimization(GDAOptimization &o) -: IterOptimization(o) -{ - temperatur = o.temperatur; - alpha = o.alpha; - gdaEndFlag = o.gdaEndFlag; -} - - -void GDAOptimization::zInitialize() -{ - IterOptimization::zInitialize(); - if(temperatur==1e100) - { - double v=problem.value(); - - - - - - temperatur=v; - } - assert(alpha>=0); -} - -short GDAOptimization::accept(double delta) -{ - if( curValue + delta < temperatur ) - return 1; - else - return 0; -} - -void GDAOptimization::abkuehlen() -{ - double newTemperatur = temperatur - alpha*(temperatur - curValue); - if( fabs(temperatur - newTemperatur)<1e-30 ) - gdaEndFlag=1; - else - gdaEndFlag=0; - temperatur = newTemperatur; -} - -short GDAOptimization::end() -{ - return ( endFlag>0 ) && ( gdaEndFlag ); -} - -void GDAOptimization::makeGraphOutput() -{ - IterOptimization::makeGraphOutput(); - *GraphOutput << temperatur-curValue; -} - - - - -double GDAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ, - int optimierungsschritte,int print) -{ - if(typ!=1) - { - cerr << "Error: wrong parameter-type in GDAOptimization::optimizeValue (" - << typ << ")\n"; - exit(1); - } - else - { - double bestPar=-1,best=1e100; - double now; - if( print ) - cout << "#GDA-optimizeValues: " << numParameter<now ) - { - best=now; - bestPar=defaultAlpha; - } - if( print ) - { - cout << defaultAlpha <<" "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller()<< " "<< end.getSigmaBigger()<< endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit" - " Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultAlpha=0.03; - return bestPar; - } - return 1e100; -} - diff --git a/ext/giza-pp/mkcls-v2/GDAOptimization.h b/ext/giza-pp/mkcls-v2/GDAOptimization.h deleted file mode 100644 index 33bcec38..00000000 --- a/ext/giza-pp/mkcls-v2/GDAOptimization.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - - - - -#ifndef GDAOPTIMIZATION -#define GDAOPTIMIZATION -#include "IterOptimization.h" - -class GDAOptimization : public IterOptimization -{ - - private: - double temperatur; - double alpha; - short gdaEndFlag; - - - protected: - virtual void zInitialize(); - - - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - virtual void makeGraphOutput(); - - - public: - GDAOptimization(Problem &p,double temperatur,double alpha, - int maxIter=-1); - - - GDAOptimization(Problem &p,int maxIter=-1); - - - GDAOptimization(GDAOptimization &o); - - - static double optimizeValue(Problem &p,int proParameter, - int numParameter,int typ,int schritte= -1,int verbose=1); - - - - static double defaultTemperatur; - static double defaultAlpha; - -}; -#endif - - diff --git a/ext/giza-pp/mkcls-v2/GNU.GPL b/ext/giza-pp/mkcls-v2/GNU.GPL deleted file mode 100644 index 5b2225e4..00000000 --- a/ext/giza-pp/mkcls-v2/GNU.GPL +++ /dev/null @@ -1,282 +0,0 @@ - - -Preamble - -The licenses for most software are designed to take away your freedom -to share and change it. By contrast, the GNU General Public License is -intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - -When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - -To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the software, or if you modify it. - -For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - -We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - -Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, -we want its recipients to know that what they have is not the -original, so that any problems introduced by others will not reflect -on the original authors' reputations. - -Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at -all. 
- -The precise terms and conditions for copying, distribution and -modification follow. - - -TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - -0. This License applies to any program or other work which contains a -notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the Program -(independent of having been made by running the Program). Whether that -is true depends on what the Program does. - -1. You may copy and distribute verbatim copies of the Program's source -code as you receive it, in any medium, provided that you conspicuously -and appropriately publish on each copy an appropriate copyright notice -and disclaimer of warranty; keep intact all the notices that refer to -this License and to the absence of any warranty; and give any other -recipients of the Program a copy of this License along with the -Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a -fee. - -2. You may modify your copy or copies of the Program or any portion of -it, thus forming a work based on the Program, and copy and distribute -such modifications or work under the terms of Section 1 above, -provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that - in whole or in part contains or is derived from the Program or - any part thereof, to be licensed as a whole at no charge to all - third parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you - provide a warranty) and that users may redistribute the program - under these conditions, and telling the user how to view a copy - of this License. (Exception: if the Program itself is interactive - but does not normally print such an announcement, your work based - on the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. 
But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - -3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of - Sections 1 and 2 above on a medium customarily used for software - interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - -4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt otherwise -to copy, modify, sublicense or distribute the Program is void, and -will automatically terminate your rights under this License. However, -parties who have received copies, or rights, from you under this -License will not have their licenses terminated so long as such -parties remain in full compliance. - -5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. 
These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - -6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted -herein. You are not responsible for enforcing compliance by third -parties to this License. - - -7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - -8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - -9. The Free Software Foundation may publish revised and/or new -versions of the General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. 
If the Program -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Program does not specify a -version number of this License, you may choose any version ever -published by the Free Software Foundation. - -10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the -author to ask for permission. For software which is copyrighted by the -Free Software Foundation, write to the Free Software Foundation; we -sometimes make exceptions for this. Our decision will be guided by the -two goals of preserving the free status of all derivatives of our free -software and of promoting the sharing and reuse of software generally. - -NO WARRANTY - -11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE -LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS -AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF -ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - -12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - -END OF TERMS AND CONDITIONS diff --git a/ext/giza-pp/mkcls-v2/HCOptimization.cpp b/ext/giza-pp/mkcls-v2/HCOptimization.cpp deleted file mode 100644 index 0c6a729a..00000000 --- a/ext/giza-pp/mkcls-v2/HCOptimization.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - - -#include "HCOptimization.h" - -HCOptimization::HCOptimization(Problem &p,int m) -: IterOptimization(p,m) -{ - if( maxStep<=0 ) - maxStep=(int)(problem.expectedNumberOfIterations()); -} -HCOptimization::HCOptimization(HCOptimization &o) -: IterOptimization(o) -{ -} - - -short HCOptimization::accept(double delta) -{ - if( delta < 0 ) - return 1; - else - return 0; -} -short HCOptimization::end() -{ - return endFlag>0; -} -void HCOptimization::abkuehlen() -{ -} - - - diff --git a/ext/giza-pp/mkcls-v2/HCOptimization.h b/ext/giza-pp/mkcls-v2/HCOptimization.h deleted file mode 100644 index ec147b22..00000000 --- a/ext/giza-pp/mkcls-v2/HCOptimization.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - -#ifndef HCOPTIMIZATION -#define HCOPTIMIZATION -#include "IterOptimization.h" - -class HCOptimization : public IterOptimization -{ - - protected: - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - public: - HCOptimization(Problem &p,int maxIter=-1); - - - HCOptimization(HCOptimization &o); - - -}; -#endif diff --git a/ext/giza-pp/mkcls-v2/IterOptimization.cpp b/ext/giza-pp/mkcls-v2/IterOptimization.cpp deleted file mode 100644 index 258cb1fb..00000000 --- a/ext/giza-pp/mkcls-v2/IterOptimization.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - -#include "IterOptimization.h" -#include "ProblemTest.h" - -ostream *GraphOutput; - - - -IterOptimization::IterOptimization(Problem& p,int m) - : maxNonBetterIterations(0),problem(p),maxStep(m),initialisiert(0) -{ -} - - - -IterOptimization::IterOptimization(IterOptimization& o) : Optimization(),problem(o.problem) -{ - maxNonBetterIterations=o.maxNonBetterIterations; - curValue = o.curValue; - bestStep = o.bestStep; - bestValue = o.bestValue; - maxStep = o.maxStep; - initialisiert = o.initialisiert; - endFlag = o.endFlag; - endFlag2 = o.endFlag2; -} - - - -double IterOptimization::minimize(int steps) -{ - if( !initialisiert ) - zInitialize(); - - if( steps==0 ) - return curValue; - - int t=0; - int every=(steps<0)?10000:(steps/1000+1); - - do - { - curStep++; - t++; - if(verboseMode&&(curStep%1000==0)) - { - if(steps>0) - cout << "Processed: " << 100.0*(curStep/(double)max(maxStep,1)) << " percent. (IterOptimization run) " - << curValue << " max:" << maxStep << " " << steps << " \r"; - else - cout << "In step:" << curStep << " currentValue: " << curValue - << " bestValue: " << bestValue-curValue << " " << curStep-bestStep << ". \r"; - cout.flush(); - } - - - ProblemChange *change= &(problem.change()); - - - double delta=problem.valueChange(*change); - - - abkuehlen(); - - - if( accept(delta) ) - { - - problem.doChange(*change); - - - curValue+=delta; - - - if( curValue1 ) - cout<<"in step: "<maxNonBetterIterations && maxNonBetterIterations>0) - endFlag=1; - if(curStep - bestStep>2*maxNonBetterIterations && maxNonBetterIterations>0) - endFlag2=1; - - - - if( GraphOutput&&((curStep%every)==0) ) - { - makeGraphOutput(); - *GraphOutput<<" "< -#include -#include "KategProblem.h" -#include "KategProblemTest.h" - -#include "ProblemTest.h" - -extern double SigmaVerfaelschung; - -double h_table[MAX_H_TABLE],l_table[MAX_H_TABLE],hmy_table[MAX_H_TABLE],hmy_sigma; - -double LWRW_Faktor=0.5; - -static int intcompare(const void *p,const void *j) -{ - return *(int *)p - *(int *)j; -} - -KategProblem::KategProblem(int aw,int mak,int _initialisierung,int _auswertung, - int _nachbarschaft,int mindestAnzahl) -: Problem(mak,aw,_initialisierung,_auswertung,_nachbarschaft), - sigmaVerfaelschung(SigmaVerfaelschung),katWasEmpty(0),nwg(mak+2),ngw(mak+2),_katOfWord(aw,-1),words(0),kats(0), - wordFreq(aw,mindestAnzahl),katFreq(mak+2,(_auswertung==CRITERION_MY)?SigmaVerfaelschung:0.0), - initLike(aw,-1) - -{ - if( auswertung == CRITERION_MY ) - cout << "Sigma-Verfaelschung: " << sigmaVerfaelschung << endl; - _maxComp=aw; - _maxCompVal=mak; - massert(katFreq.nKats>0); - massert(mak<=aw); - - - for(int i=1;i2)cout << "KategProblem::_initialize(INIT_OTHER)\n"; - for(i=0;i2)cout << "KategProblem::_initialize(INIT_RAN)\n"; - for(i=0;i0 && wordFreq.maxIndex[i]>0 ) - fastPutWord(i,wordFreq.minIndex[i]+randomInt(wordFreq.maxIndex[i]-wordFreq.minIndex[i]+1)); - else - fastPutWord(i,2+randomInt(katFreq.nKats-2)); - } - - - break; - case INIT_AIO: - - if(verboseMode>2)cout << "KategProblem::_initialize(INIT_AIO)\n"; - for(i=0;i2)cout << "KategProblem::_initialize(INIT_FREQ)\n"; - for(i=0;i=katFreq.nKats ) - to=katFreq.nKats-1; - fastPutWord((*(wordFreq.absteigend))[i],to); - } - curComp=katFreq.nKats-2; - break; - case INIT_LWRW: - - { - Array markList(wordFreq.nWords,1); - int to=2; - int i=0; - if(verboseMode>2)cout << "KategProblem::_initialize(INIT_LWRW)\n"; - for(to=2;to0 ) - to++; - } - for(i=0;i& aft=wordFreq.after[word]; - - int nAft=aft.size(); - - for(i=0;i2) - { - cout << "\nInitialization of 
KategProblem:"; - dumpOn(cout); - } -} - -double KategProblem::valueChange(ProblemChange&c) - -{ - numberOfPartEvaluations++; - KategProblemChange &k=*(KategProblemChange *)&c; - fillNWG(k.word); - - return _valueChange(k); -} - - -Problem *KategProblem::makeEqualProblem() - -{ - KategProblem*p = new KategProblem(wordFreq.nWords,katFreq.nKats-2,initialisierung, - auswertung,nachbarschaft); - KategProblemWBC &w=p->wordFreq; - for(int x=0;xwords = new leda_array(*words); - for(i=0;isetKatOfWord(i,katOfWord(i)); - p->initLike[i]=initLike[i]; - } - p->setValuesFrom(this); - return p; -} - -double KategProblem::nicevalue(double val) - -{ - double v; - if( val!=1e100) - v=val; - else - v=value(); - double h=wordFreq.get_h_of_words(); - double n=wordFreq.numberOfWords(); - double k=0; - if(auswertung == CRITERION_MY) - k=katFreq.myCriterionTerm(); - return exp((v+h-k)/n); -} - -void KategProblem::makeKats() - -{ - if(kats)delete kats; - kats = new leda_array(katFreq.nKats); - for(int i=0;i&theSet = (*kats)[i]; - if( words==0 ) - { - int nr=0; - forall_set(leda_set,nr,theSet) - { - *PrintBestTo2 << nr << ", "; - printed=1; - } - } - else - { - int nr=0; - forall_set(leda_set,nr,theSet) - { - *PrintBestTo2 << (*words)[nr]<< ","; - printed=1; - } - } - if(printed==1)anzkat++; - *PrintBestTo2 << endl; - } - *PrintBestTo2 << ";I have " << anzkat << " categories used.\n"; - } - *PrintBestTo2 << endl; - Problem::dumpOn(*PrintBestTo2); - } -} - - - - - - -const char *KategProblem::getString(int i) - -{ - if(words==0) - return "<>"; - else - return ((*words)[i]).c_str(); -} - -string KategProblem::getTheString(int i) -{ - return (*words)[i]; -} - -int KategProblem::maxNonBetterIterations() - -{ - if(katwahl()==K_BEST) - return wordFreq.nTranspWords; - else - return katFreq.nKats*wordFreq.nTranspWords; -} - -int KategProblem::expectedNumberOfIterations() - -{ - - if(katwahl()==K_BEST) - return 10*wordFreq.nTranspWords; - else - return 13*katFreq.nKats*wordFreq.nTranspWords; -} - -void KategProblem::makeTitle(char x[512]) - -{ - const char *ww; - const char *kw; - const char *in; - switch(wortwahl()) - { - case W_RAN: - ww="zufaellig"; - break; - case W_DET_DECR: - ww="absteigend"; - break; - case W_DET_INCR: - ww="aufsteigend"; - break; - default: - cerr << "Error: unknown word selection\n"; - exit(1); - } - switch(katwahl()) - { - case K_DET: - kw="rotierend"; - break; - case K_RAN: - kw="zufaellig"; - break; - case K_BEST: - kw="best "; - break; - default: - cout << "Error: unknown cagegory selection\n"; - exit(1); - } - switch(initialisierung) - { - case INIT_RAN: - in="zufaellig "; - break; - case INIT_AIO: - in="all-in-one"; - break; - case INIT_LWRW: - in="lwrw "; - break; - case INIT_FREQ: - in="freq "; - break; - case INIT_OTHER: - in="other "; - break; - default: - cout << "Error: unknown initialization\n"; - exit(1); - } - sprintf(x,"(c:%d,w:%d(%d),ww:%s,kw:%s,in:%s)",katFreq.nKats,wordFreq.nWords, - wordFreq.nTranspWords,ww,kw,in); -} - - - - -int KategProblem::_change(ProblemChange **p) - -{ - *p=0; - int word=curDimension(); - switch( wortwahl() ) - { - case W_RAN: - word=(*(wordFreq.absteigend))[randomInt(wordFreq.nTranspWords)]; - break; - case W_DET_DECR: - word=(*(wordFreq.absteigend))[word]; - break; - case W_DET_INCR: - word=(*(wordFreq.absteigend))[wordFreq.nTranspWords-word-1]; - break; - default: - cerr << "Error: Unknown word selection\n"; - exit(1); - } - - int kat=curDimensionVal()+2; - switch( katwahl() ) - { - case K_RAN: - kat=randomInt(katFreq.nKats-2)+2; - - case K_DET: - - - 
if( kat==katOfWord(word)||(katWasEmpty&&katFreq.n1(kat)==0) ) - return 0; - else if( wordFreq.minIndex[word]>0 && wordFreq.maxIndex[word]>0 && (katwordFreq.maxIndex[word])) - { - - return 0; - } - else - { - KategProblemChange *c = new KategProblemChange; - c->toKat=kat; - c->word=word; - c->fromKat=katOfWord(c->word); - massert( c->toKat < katFreq.nKats ); - massert( c->fromKat < katFreq.nKats ); - massert( c->word < wordFreq.nWords ); - massert( c->toKat!=0 && c->toKat!=1 ); - massert( c->fromKat!=0 && c->fromKat!=1 ); - if(katFreq.n1(kat)==0) - katWasEmpty=1; - *p=c; - return 1; - } - break; - case K_BEST: - { - fillNWG(word); - double smallest=1e100; - KategProblemChange &smallestChange = *new KategProblemChange; - short withEmpty=0; - - - int startKat=2; - int endKat=katFreq.nKats; - if( wordFreq.minIndex[word]>0&&wordFreq.maxIndex[word]>0 ) - { - startKat = max(2,wordFreq.minIndex[word]); - endKat = min(katFreq.nKats,wordFreq.maxIndex[word]+1); - } - for(kat=startKat;kat0 ) - return n*log(tf); - else - return 0.0; -} - -double mkat_h_part(int n,double cf) -{ - - - if( cf>0.0 ) - return n*log(cf); - else - return 0.0; -} - -double KategProblem::kat_h_full(int n) -{ - return mkat_h_full(n,verfaelsche(n,sigmaVerfaelschung)); -} -double KategProblem::kat_h_full(double n) -{ - abort(); - return mkat_h_full((int)n,verfaelsche(n,sigmaVerfaelschung)); -} - -double KategProblem::kat_h_part(int n) -{ - return mkat_h_part(n,verfaelsche(n,sigmaVerfaelschung)); -} -double KategProblem::kat_h_part(double n) -{ - abort(); - return mkat_h_part((int)n,verfaelsche(n,sigmaVerfaelschung)); -} - - - - -double KategProblem::nmo_my(int i,int j) - -{ - FreqType n=nstrich(i,j),k=katFreq.n(i,j); - return kat_h_full(n+k)-kat_h_full(k); -} -double KategProblem::nmo(int i,int j) - -{ - FreqType n=nstrich(i,j),k=katFreq.n(i,j); - return kat_h(n+k)-kat_h(k); -} -double KategProblem::nmo_lo(int i,int j,int &e0,int &e1) - -{ - FreqType kij=katFreq.n(i,j); - FreqType nij=nstrich(i,j)+kij; - if( kij!=nij) - { - if( nij==0 ) - e0++; - else if(nij==1) - e1++; - if( kij==0 ) - e0--; - else if(kij==1) - e1--; - } - return nij*kat_mlog(nij-1-rhoLo)-kij*kat_mlog(kij-1-rhoLo); -} - - -double KategProblem::_valueChange(KategProblemChange &k) - -{ - double v=0; - int i=0; - - ursprung=k.fromKat; - ziel=k.toKat; - - if( auswertung==CRITERION_LO ) - { - int e0a=katFreq.eta0,e1a=katFreq.eta1; - v-=nmo_lo(ursprung,ursprung,e0a,e1a)+nmo_lo(ziel,ziel,e0a,e1a) - +nmo_lo(ursprung,ziel,e0a,e1a)+nmo_lo(ziel,ursprung,e0a,e1a); - i=0; - while(i0 && katFreq.n1(ursprung)==wordFreq.n1(k.word) ) - nc1_0++; - if( wordFreq.n2(k.word)>0 && katFreq.n2(ursprung)==wordFreq.n2(k.word) ) - nc2_0++; - if( wordFreq.n1(k.word)>0 && katFreq.n1(ziel)==0 ) nc1_0--; - if( wordFreq.n2(k.word)>0 && katFreq.n2(ziel)==0 ) nc2_0--; - int new0=nc1_0*katFreq.nKats+nc2_0*katFreq.nKats-nc1_0*nc2_0; - v-=kat_etaFkt(e0a,e1a,new0,katFreq.nKats) - -kat_etaFkt(katFreq.eta0,katFreq.eta1,old0,katFreq.nKats); - vassert(NULLFLOAT(Problem::valueChange(k)-v)); - } - else if(auswertung==CRITERION_ML) - { - v-=nmo(ursprung,ursprung)+nmo(ziel,ziel) - +nmo(ursprung,ziel)+nmo(ziel,ursprung); - i=0; - while(i2) - cout << "ZUSATZ: " << bishZusatz << " " << neuZusatz << " " < &after=wordFreq.after[w]; - int size=after.size(),i; - nww=0; - nwg.init(); - for(i=0;i &before=wordFreq.before[w]; - size=before.size(); - ngw.init(); - for(i=0;i=0 && toKat=0 ) - toKat=wordFreq.fixedWord[word]; - massert(katOfWord(word)==-1); - setKatOfWord(word,toKat); -} - -void KategProblem::fixInitLike() -{ 
- int fixed=0,fixed2=0; - over_arr(initLike,i) - if(initLike[i]>=0 ) - { - fixed++; - if( initLike[i]>=wordFreq.minIndex[i] || initLike[i]==1 ) - wordFreq.fixedWord[i]=initLike[i]; - else - { - wordFreq.fixedWord[i]=wordFreq.minIndex[i]+initLike[i]-2; - fixed2++; - } - initLike[i]=-1; - } - cout << "Fixed from file are: " << fixed << " " << fixed2 << " words.\n"; -} - -void KategProblem::putWord(int word,int toKat) - -{ - massert(toKat!=0);massert(toKat!=1); - massert(word& aft=wordFreq.after[word]; - Array& bef=wordFreq.before[word]; - int nAft=aft.size(); - int nBef=bef.size(); - int i; - if(verboseMode>4) - cout << "putWord(" << word << "," << toKat << ")" << k << " nAft" - << nAft << " nBef" << nBef << " k" << k << "\n"; - - massert( k!=-1 ); - massert( k!=toKat ); - - for(i=0;i4) - cout << k << " " << katOfWord(aft[i].w) << " " << -aft[i].n << endl; - } - for(i=0;i4) - cout << katOfWord(bef[i].w) << " " << k << " " << -bef[i].n << endl; - } - - setKatOfWord(word,toKat); - - for(i=0;i0); - massert(anzKategProblemChange<2); - if( anzKategProblemChange==1 ) - return &theOneKategProblemChange; - else - { - if( verboseMode>1 ) - cout << "generate instance of KategProblemChange: " << size - << " " << anzKategProblemChange<< endl; - return malloc(size); - } -} -void KategProblemChange::operator delete(void *ptr,size_t -) -{ massert(size==sizeof(KategProblemChange)); - anzKategProblemChange--; - if( ptr!= &theOneKategProblemChange) - free(ptr); -} - - - - - - - - - - - - -NWG::NWG(int n) : freq(n,0),timeOfFreq(n,0),not0(n),word(-1) -{ - massert(n>0); - curTime=1; - init(); -} - -void NWG::init() -{ - curTime++; - anzNot0=0; -} - -void NWG::sort() -{ - qsort(not0.getPointerToData(),anzNot0,sizeof(int),intcompare); - massert(anzNot0<=not0.size()); -} - - -int KategProblem::maxDimension() -{ - return _maxComp; -} - -int KategProblem::maxDimensionVal() -{ - return _maxCompVal; -} - diff --git a/ext/giza-pp/mkcls-v2/KategProblem.h b/ext/giza-pp/mkcls-v2/KategProblem.h deleted file mode 100644 index e5a5a461..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblem.h +++ /dev/null @@ -1,439 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - - - - -#ifndef KATEG_OPT_H -#define KATEG_OPT_H -#include - -#include -#include "Problem.h" - -extern double rhoLo; - -typedef int Kategory; -typedef int Word; - - - -#ifdef FREQTYPE_DOUBLE -typedef double FreqType; -#else -typedef int FreqType; -#endif - - -#include "KategProblemWBC.h" - - -#include "KategProblemKBC.h" - - -enum { - INIT_RAN=1, - INIT_AIO=2, - INIT_LWRW=3, - INIT_FREQ=4, - INIT_OTHER=5 - }; - - -enum { - W_RAN=(8|16), - W_DET_DECR=(16), - W_DET_INCR =(32) -}; -#define CHOOSE_WORD (8|16|32) - - -enum { - K_DET=(64), - K_RAN=(128), - K_BEST=(64|128) -}; -#define CHOOSE_KAT (64|128) - - -enum { - CRITERION_ML=0, - CRITERION_LO=1, - CRITERION_MY=2 -}; - - - -class NWG -{ - private: - Array freq; - - Array timeOfFreq; - - - - - int curTime; - public: - NWG(int n); - void init(); - - int anzNot0; - - - Array not0; - - int word; - - inline void addFreq(int C,FreqType n); - - void sort(); - - FreqType getFreq(int i) - { - if( timeOfFreq[i]==curTime ) - return freq[i]; - else - return 0; - }; -}; - -inline void NWG::addFreq(int g,FreqType n) -{ - if(timeOfFreq[g]==curTime) - freq[g]+=n; - else - { - timeOfFreq[g]=curTime; - freq[g]=n; - not0[anzNot0++]=g; - } -} - - - -struct KategProblemChange : public ProblemChange -{ - void *operator new(size_t size); - void operator delete(void *ptr,size_t size); - - int word; - int toKat; - int fromKat; -}; - -class KategProblem : public Problem -{ - private: - double kat_h_full(int n); - double kat_h_full(double n); - double kat_h_part(int n); - double kat_h_part(double n); - double sigmaVerfaelschung; - short katWasEmpty; - - - - int nwgWord; - - NWG nwg; - NWG ngw; - FreqType nww; - - int ursprung,ziel; - - Array _katOfWord; - - int _maxComp,_maxCompVal; - - double nmo_my(int i,int j); - double nmo(int i,int j); - - - double nmo_lo(int i,int j,int &e0,int &e1); - - - void putWord(int word,int to); - - - void fastPutWord(int word,int to); - - - void setKatOfWord(int w,int k) -{ - if( !(wordFreq.fixedWord[w]==k||wordFreq.fixedWord[w]==-1||k==-1) ) - { - cout << "mkcls::setKatOfWord::ERROR: " << w << " " << k << " " << wordFreq.fixedWord[w] << " " << (*words)[w] << endl; - } - _katOfWord[w]=k; - nwgWord=-1; -}; - - - void fillNWG(int w); - - - inline FreqType nstrich(int i,int j); - - - void vnstrich(int i,int j); - - - - protected: - virtual int _change(ProblemChange **p); - - - virtual void _doChange(ProblemChange &c); - - - virtual void _undoChange(ProblemChange &c); - - - virtual double _value(); - - - double _valueChange(KategProblemChange &k); - - - virtual void incrementDirection(); - - - virtual int maxDimensionVal(void) ; - - - virtual int maxDimension(void) ; - - -public: - leda_array *words; -typedef leda_set intSet; - -leda_array *kats; - - KategProblemWBC wordFreq; - KategProblemKBC katFreq; - - Array initLike; - - KategProblem(int aw,int mak,int _initialisierung,int _auswertung, - int _nachbarschaft,int minw=0); - - - virtual ~KategProblem(); - - - virtual void _initialize(int initTyp); - virtual void _initialize(int initTyp,int specialFixedWord); - - - virtual double valueChange(ProblemChange&c); - - - virtual Problem *makeEqualProblem(); - - - virtual double nicevalue(double value=1e100); - - - void makeKats(); - - - virtual void dumpOn(ostream &strm); - - - virtual void dumpInfos(ostream &strm); - - - - - - inline void katwahl(int k); - - - inline void wortwahl(int w); - - - - - - inline int katOfWord(int w); - - - inline short wortwahl(); - - - inline short katwahl() ; - - - virtual int maxNonBetterIterations(); 
- - - virtual int expectedNumberOfIterations(); - - - const char *getString(int i); - string getTheString(int i); - - - void makeTitle(char x[512]); - - - void fixInitLike(); - -}; - -inline int KategProblem::katOfWord(int w){return _katOfWord[w];}; -inline short KategProblem::wortwahl(){return nachbarschaft&CHOOSE_WORD;}; -inline short KategProblem::katwahl() {return nachbarschaft&CHOOSE_KAT;}; - -inline void KategProblem::katwahl(int k) - { - nachbarschaft = (nachbarschaft&(~CHOOSE_KAT)) | k; - if(k==K_BEST) - _maxCompVal=1; - else - _maxCompVal=katFreq.nKats-2; - }; - -inline void KategProblem::wortwahl(int w) - { - nachbarschaft = (nachbarschaft&(~CHOOSE_WORD)) | w; - }; - - - -inline FreqType KategProblem::nstrich(int i,int j) -{ - FreqType n=0; - - if( i==ursprung ) - n-=nwg.getFreq(j); - if( i==ziel ) - n+=nwg.getFreq(j); - - if( j==ursprung ) - n-=ngw.getFreq(i); - if( j==ziel ) - n+=ngw.getFreq(i); - - if( i==ursprung && j==ursprung ) - n+=nww; - if( i==ziel && j==ziel ) - n+=nww; - - if( i==ursprung && j==ziel ) - n-=nww; - if( i==ziel && j==ursprung ) - n-=nww; - - return n; -} - - - - - -#define MAX_H_TABLE 4000 -extern double h_table[],l_table[],hmy_table[],hmy_sigma; - - -inline double kat_mlog(double x) -{ - if(x<=1e-9) - return 0; - else - return log(x); -} - - -inline double kat_mlog(int s) -{ - if(s<=0) - return 0; - else if( s=-1); - if(n<=0) - return 0; - else - if(n1) - return e1*log( (ePlus-1.0)/(e0+1.0)*rhoLo ); - else - return 0; -} - -double mkat_h_full(int n,double tf); -double mkat_h_part(int n,double cf); - -int Hash(const string& s); - - -#endif - diff --git a/ext/giza-pp/mkcls-v2/KategProblemKBC.cpp b/ext/giza-pp/mkcls-v2/KategProblemKBC.cpp deleted file mode 100644 index 97c40fc7..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblemKBC.cpp +++ /dev/null @@ -1,243 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
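
// The kat_mlog helpers above avoid calling log() for small counts by
// consulting tables indexed by the count (l_table for log n, h_table for
// n*log n, valid below MAX_H_TABLE) and falling back to log() otherwise.
// A sketch of that table-driven evaluation; the struct name and the
// table size are chosen here for illustration only.

#include <cmath>
#include <vector>

struct LogTables {
  std::vector<double> l, h;            // l[n] = log n, h[n] = n * log n
  explicit LogTables(int maxN) : l(maxN, 0.0), h(maxN, 0.0) {
    for (int n = 1; n < maxN; ++n) {   // n = 0 stays 0: define 0*log 0 := 0
      l[n] = std::log((double)n);
      h[n] = n * l[n];
    }
  }
  double nlogn(int n) const {          // the quantity entropy criteria need
    if (n <= 0) return 0.0;
    return n < (int)h.size() ? h[n] : n * std::log((double)n);
  }
};
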
- -*/ - - - - -#include -#include "KategProblem.h" - -double rhoLo=0.75; -#define MAX_VERFAELSCHUNG 5000 -double verfTab[MAX_VERFAELSCHUNG],verfTabSigma=-1.0; -double verfaelsche(int a,double b) -{ - - if( a>=0&&verfTabSigma==b&&a LANGSAMER!!!\n"; - } - for(i=0;i1 ) - { - cout << "CRITERION_MY: " << bewertung << endl; - cout << "U1:"<<_unigramVerfSum1 << " n:"<0.000001) - { - - - if(verboseMode>1 ) - cout << " NEU: " <<_nWords*log( u1 * u2 / b ) << endl; - bewertung -= _nWords*log( u1 * u2 / b ); - if(verboseMode>1) - cout << "SCHLUSSBEWERTUNG: " << bewertung << endl; - } - else - cout << "B zu klein " << b << endl; - } - break; - case CRITERION_LO: - for(c1=0;c11 ) - { - cout << "nwords divisor:"<<_nWords << " " << u1 * u2 / b << endl; - cout << "ergebnis: "<<_nWords*log( u1 * u2 / b ) << endl; - cout << "0: "< FreqArray; -typedef Array FreqArrayReal; - - -double verfaelsche(int a,double b); -double verfaelsche(double a,double b); - -class KategProblemKBC - - -{ - friend class KategProblem; - - private: - Array _n; - Array _n1; - - Array _n2; - - - double sigmaVerfaelschung; - short withVerfaelschung; - - Array _nverf; - Array _n1verf; - Array _n2verf; - FreqType _nWords; - - protected: - int eta0; - int eta1; - int c1_0; - int c2_0; - double _bigramVerfSum; - double _unigramVerfSum1; - double _unigramVerfSum2; - double verfInit0; - - public: - int nKats; - - KategProblemKBC(int nKats,double sv); - - - double fullBewertung(int auswertung); - - - FreqType n(int w1,int w2) { return _n[w1][w2]; }; - - - FreqType n1(int w) { return _n1[w];}; - - - FreqType n2(int w) { return _n2[w];}; - - - double bigramVerfSum(); - double unigramVerfSum1(); - double unigramVerfSum2(); - - double nverf(int w1,int w2) { return _nverf[w1][w2]; } - - double n1verf(int w) { return _n1verf[w]; }; - - double n2verf(int w) { return _n2verf[w]; }; - - inline void addN(int w1,int w2, FreqType n); - - - void setN(int w1,int w2, FreqType n); - - - double myCriterionTerm(); - -}; - -inline void KategProblemKBC::addN(int w1,int w2, FreqType n) -{ - if(n!=0) - { - FreqType &s= _n[w1][w2]; - if(s==0) - eta0--; - else if(s==1) - eta1--; - if(_n1[w1]==0) - c1_0--; - if(_n2[w2]==0) - c2_0--; - - if(withVerfaelschung) - { - double verfOld=verfaelsche(s,sigmaVerfaelschung); - double verfNew=verfaelsche(s+n,sigmaVerfaelschung); - double verfOld1=verfaelsche(_n1[w1],sigmaVerfaelschung); - assert(verfOld1==_n1verf[w1]); - double verfNew1=verfaelsche(_n1[w1]+n,sigmaVerfaelschung); - double verfOld2=verfaelsche(_n2[w2],sigmaVerfaelschung); - assert(verfOld2==_n2verf[w2]); - double verfNew2=verfaelsche(_n2[w2]+n,sigmaVerfaelschung); - _n1verf[w1]=verfNew1; - _unigramVerfSum1+=verfNew1-verfOld1; - _n2verf[w2]=verfNew2; - _unigramVerfSum2+=verfNew2-verfOld2; - _nverf[w1][w2]=verfNew; - _bigramVerfSum+=verfNew-verfOld; - _nWords+=n; - } - s+=n;_n1[w1]+=n;_n2[w2]+=n; - - assert(_n[w1][w2]>=0); - assert(_n1[w1]>=0); - assert(_n2[w2]>=0); - - if(s==0) - eta0++; - else if(s==1) - eta1++; - if(_n1[w1]==0) - c1_0++; - if(_n2[w2]==0) - c2_0++; - } -}; -#endif diff --git a/ext/giza-pp/mkcls-v2/KategProblemTest.cpp b/ext/giza-pp/mkcls-v2/KategProblemTest.cpp deleted file mode 100644 index df88d18e..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblemTest.cpp +++ /dev/null @@ -1,695 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . 
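
// KategProblemKBC::addN above keeps the count-of-count statistics eta0
// (empty bigram cells) and eta1 (cells holding exactly 1) consistent by
// moving the touched cell out of its old bucket before the update and
// into its new bucket afterwards, so the likelihood criterion never has
// to rescan the whole class-bigram matrix. The same decrement/increment
// pattern in isolation; names and types here are illustrative.

#include <cassert>
#include <vector>

struct BigramCounts {
  int n;                          // number of classes
  std::vector<long> cnt;          // n*n bigram counts, row-major
  long eta0, eta1;                // #cells with count 0 / count 1
  explicit BigramCounts(int classes)
      : n(classes), cnt((size_t)classes * classes, 0),
        eta0((long)classes * classes), eta1(0) {}
  void add(int c1, int c2, long d) {
    long &s = cnt[(size_t)c1 * n + c2];
    if (s == 0) --eta0; else if (s == 1) --eta1;  // leave the old bucket
    s += d;
    assert(s >= 0);
    if (s == 0) ++eta0; else if (s == 1) ++eta1;  // enter the new bucket
  }
};
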
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "KategProblemTest.h" - -#include "ProblemTest.h" -#include "HCOptimization.h" -#include "TAOptimization.h" -#include "RRTOptimization.h" -#include "GDAOptimization.h" - -#include -#include -#include -#include - -typedef pair PSS; - -#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$" - -#ifdef NeXT -char *strdup(char *a) -{ - char *p = (char *)malloc(strlen(a)+1); - strcpy(p,a); - return p; -} - -#endif - - -void writeClasses(Array &katOfWord,KategProblem &problem,ostream &to) -{ - for(int i=0;i translation(-1); - int maxCat=2; - ifstream in(fname); - if(!in) - { - cerr << "Error: File '" << fname << "' cannot be opened.\n"; - exit(1); - } - for(int i=0;iwordFreq.nWords;i++) - (p->initLike)[i]= -1; - - - translation["1"]=1; - translation["0"]=0; - - - string s; - while( getline(in,s) ) - { - string str,categ; - mysplit(s,str,categ); - int i=p->words->binary_locate(str); - if(i>=0 && (*(p->words))[i]==str ) - { - - if( translation[categ]==-1 ) - translation[categ]=maxCat++; - int cat=translation[categ]; - if( (p->initLike)[i]!= -1 ) - cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n"; - (p->initLike)[i]=cat; - } - else - cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n"; - } - - if( verboseMode ) - cout << "We have " << maxCat << " read non-empty categories" - " (with words from the corpus).\n"; - - if(maxCat>p->katFreq.nKats) - { - cerr << "Error: Not enough categories reserved (only " - << p->katFreq.nKats << ", but i need " << maxCat << ").\n"; - exit(1); - } - - - int i=p->words->binary_locate("$"); - if( i>=0 && (*(p->words))[i]=="$" ) - (p->initLike)[i]=0; - else - if( verboseMode ) - cerr << "Warning: No '$' in vocabulary!\n"; - - - int errors=0; - for(i=0;iwordFreq.nWords;i++) - if((p->initLike)[i]== -1 ) - { - if( verb ) cerr << "Error: I don't know the category of word " << i - << " (" << (*(p->words))[i] << ") " << ".\n"; - errors=1; - } - return errors; -} - - - -KategProblem *makeKategProblem(const leda_h_array&cTbl,const leda_set&setVokabular, int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency) -{ - - int nwrd=0; - leda_array&sVok = *new leda_array(setVokabular.size()); - string s; - unsigned int ctr=0; - forall_set(leda_set,s,setVokabular) - { - if( verboseMode>2 ) - cout << "mkcls:Wort " << ctr << " " << s << endl; - sVok[ctr++]=s; - } - for(unsigned int z=0;z2 ) - cout << "*****Vocabulary: " << sVok; - - unsigned int vokSize=sVok.size(); - massert(vokSize==ctr); massert(vokSize==setVokabular.size()); - if(verboseMode) - {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();} - - KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung, - auswertung,nachbarschaft,minWordFrequency); - KategProblemWBC &w=k->wordFreq; - k->words=&sVok; - - Array 
after(vokSize,0); - Array before(vokSize,0); - - - nwrd=0; - { - PSS s; - forall_defined_h2(PSS,FreqType,s,cTbl) - { - const string&ss1=s.first; - const string&ss2=s.second; - if( ss2.length()&&(ss1!="$" || ss2!="$") ) - { - int i1=sVok.binary_search(ss1); - int i2=sVok.binary_search(ss2); - iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 ); - after[i1]++; - before[i2]++; - } - if( verboseMode&&((nwrd++)%10000==0) ) - {cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();} - } - } - - for(unsigned int i=0;i2 ) - cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " " - << ss2 << ":" << i2 << " " << p << endl; - } - if( verboseMode&&((nwrd++)%10000==0) ) - {cout<<"Statistiken-2 " < setVokabular; - leda_h_array cTbl; - double c=0; - if( verboseMode )cout << "NGRFILE: " << str << endl; - string s1,s2; - while(file >> c >> s1 >> s2) - { - if( s1.length()==0||s2.length()==0 ) - { - cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl; - return 0; - } - if( c==0 ) - { - cerr << "Count ist 0 " << s1 << " " << s2 << endl; - return 0; - } - cTbl[pair(s1,s2)]=(FreqType)c; - setVokabular.insert(s1); - setVokabular.insert(s2); - if( verboseMode>1 ) - cout << "R: " << s1 << " " << s2 << " " << c << endl; - c=0; - } - - return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); -} - - - - - - - - -KategProblem *fromKModel(const char *str,int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency) -{ - string oldText,text,line; - ifstream f(str); - if( !f ) - { - cerr << "ERROR: can not open file " << str << ".\n"; - return 0; - } - - leda_set setVokabular; - leda_h_array cTbl(0); - oldText="$"; - while(1) - { - getline(f,line); - if(f.fail() && !f.bad() && !f.eof()) - { - cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear(); - } - if(!f)break; - - istringstream f2(line); - while( 1 ) - { - f2 >> text; - if(f2.fail() && !f2.bad() && !f2.eof()) - { - cerr << "WARNING: strange characters in stream (>>) !\n"; - f2.clear(ios::failbit); - } - if(!f2){break;} - if( text == "$" ) - text = "mkcls-mapped-dollar-symbol-$"; - if( !setVokabular.member(text) )setVokabular.insert(text); - cTbl[pair(oldText,text)]++; - oldText=text; - } - text="$"; - if( !setVokabular.member(text) )setVokabular.insert(text); - cTbl[pair(oldText,text)]++; - oldText=text; - } - return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); -} - - - - - -void KategProblemSetParameters(KategProblem &p) -{ - if( p.katwahl()==K_BEST ) - { - TAOptimization::defaultAnnRate=0.7; - RRTOptimization::defaultAnnRate=0.95; - GDAOptimization::defaultAlpha=0.05; - if( verboseMode ) - cout << "Parameter-setting like W-DET-BEST\n"; - } - else - { - TAOptimization::defaultAnnRate=0.4; - RRTOptimization::defaultAnnRate=0.6; - GDAOptimization::defaultAlpha=0.0125; - if( verboseMode ) - cout << "Parameter-setting like W-DET-DET\n"; - } -} - - - - -KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, - int auswertung,int nachbarschaft,float relInit) -{ - KategProblem &k= - *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft); - KategProblemWBC &w=k.wordFreq; - Array after(ANZ_WORD,0); - Array before(ANZ_WORD,0); - Array twoD(ANZ_WORD); - int i; - for(i=0;i &_izrOptimization(Array &probs, -int anzprob,double timeForOneRed,double maxClock,Array &katOfWord, -int anzIter,int verfahren) -{ - massert(anzprob>1); - 
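
// fromKModel above reads a token stream, maps the literal token "$" to a
// reserved symbol, and counts each adjacent pair, inserting the sentence
// boundary "$" between lines. A pared-down version of that counting loop
// using std::map; error handling and the vocabulary set are omitted, and
// the function name is illustrative.

#include <istream>
#include <map>
#include <sstream>
#include <string>
#include <utility>

std::map<std::pair<std::string, std::string>, long>
countBigrams(std::istream &in) {
  std::map<std::pair<std::string, std::string>, long> cnt;
  std::string line, tok, prev = "$";
  while (std::getline(in, line)) {
    std::istringstream ls(line);
    while (ls >> tok) {
      ++cnt[{prev, tok}];     // bigram (previous token, current token)
      prev = tok;
    }
    ++cnt[{prev, "$"}];       // close the line against the boundary symbol
    prev = "$";
  }
  return cnt;
}
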
massert(probs[0]->wordFreq.mindestAnzahl<=1); - KategProblem *p0=probs[0]; - - int nWords=p0->wordFreq.nWords; - int nKats=p0->katFreq.nKats; - int minimumNumberOfWords = max(1,int(nWords*0.95)); - - int indexOfDurchschnitt; - Array newWords(nWords); - int useAnzprob=anzprob; - do - { - int w,k; - indexOfDurchschnitt=0; - for(w=0;wwordFreq.nWords==nWords); - probs[k]->makeKats(); - } - - for(w=0;w durchschnitt=(*p0->kats)[p0->katOfWord(w)]; - for(k=1;kkats)[probs[k]->katOfWord(w)]; - - - int _anzInDurchschnitt=0; - int nr=0; - forall_set(leda_set,nr,durchschnitt) - { - _anzInDurchschnitt++; - newWords[nr]=indexOfDurchschnitt; - } - if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) - { - cout << "- ("; - forall_set(leda_set,nr,durchschnitt) - { - cout << p0->getString(nr); - if( p0->wordFreq.n1(nr)==1 ) - cout << "* "; - else - cout << " "; - } - cout << ")\n"; - } - - - - - for(k=0;kkats)[probs[k]->katOfWord(w)]; - } - indexOfDurchschnitt++; - } - } - - if(indexOfDurchschnitt>=minimumNumberOfWords) - { - if(useAnzprob==1) - { - cout << "useAnzProb==1 => mysterious.\n"; - break; - } - useAnzprob--; - } - } - while(indexOfDurchschnitt>=minimumNumberOfWords); - - - Array &neu=*new Array(MAX_MULTIPLE*anzprob,(KategProblem *)0); - qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem); - massert(useAnzprob<=probs.size()); - double startTime=clockSec(); - int i, numberOfNew; - for(numberOfNew=0; (clockSec()-startTimeinitialisierung,p0->auswertung,p0->nachbarschaft); - - for(w=0;wwordFreq.setAfterWords(w,5); - p->wordFreq.setBeforeWords(w,5); - } - for(w=0;w &after=p0->wordFreq.after[w]; - int size=after.size(); - for(i=0;iwordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n); - } - p->wordFreq.testFull(1); - - - - - - - p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words()); - double w1=0.0,w2=0.0; - if(numberOfNewinitLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i); - p->_initialize(5); - HCOptimization hc(*p,-1); - if(verboseMode) - { - w1=p->nicevalue(); - cout << "from old category system:" << w1 << endl; - } - hc.minimize(-1); - if(verboseMode) - { - w2=p->nicevalue(); - if(w2_initialize(1); - double mean; - StatVar end,laufzeit,start; - solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start); - w2=p->value(); - if(verboseMode) - cout << "new category system: " << w2 << " (" << p->nicevalue() - << ") Zeit: " << clockSec() << "\n"; - } - } - int p; - for(p=0;pvalue() << " " - << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: " - << clockSec() << endl; - if( indexOfDurchschnitt<=nKats - || (clockSec()>maxClock&&maxClock) ) - { - if( clockSec()>maxClock&&maxClock ) - cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n"; - for(i=0;ikatOfWord(newWords[i]); - return neu; - } - else - { - Array &newKatOfWord= - *(new Array(neu[0]->wordFreq.nWords,-1)); - Array &erg=_izrOptimization(neu,anzprob,timeForOneRed, - maxClock,newKatOfWord, - anzIter+1,verfahren); - for(i=0;i katOfWord(p.wordFreq.nWords,-1); - int startN; - if( clockForOneRed<=0 ) - startN=firstN; - else - startN=1000; - Array probs(startN); - double val1=0.0,val2=0.0; - double endTime=-1; - - double startTime=clockSec(); - int i; - for(i=0;i=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) ) - break; - } - if( endTime<0 ) - endTime=clockSec(); - massert(i>=firstN); - - qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem); - massert(i<=probs.size()); - if( clockForOneRed<=0 ) - { - clockForOneRed=endTime-startTime; - if( 
verboseMode ) - cout << "time for one reduction: " << clockForOneRed << endl; - } - _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren); - - KategProblem *n=(KategProblem *)(p.makeEqualProblem()); - n->initLike= katOfWord; - n->_initialize(5); - if( verboseMode ) - val1=n->value(); - HCOptimization hc(*n,-1); - hc.minimize(-1); - val2=n->value(); - if( verboseMode ) - cout << "last improvement: " << val2-val1 << "\n"; - cout << "final costs: " << val2 << " " << n->nicevalue() << endl; - if(PrintBestTo) - n->dumpOn(*PrintBestTo); - return n; -} - - - - - - - - - - - diff --git a/ext/giza-pp/mkcls-v2/KategProblemTest.h b/ext/giza-pp/mkcls-v2/KategProblemTest.h deleted file mode 100644 index 7767b7d9..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblemTest.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#include "KategProblem.h" - - -KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initialisierung, - int auswertung,int nachbarschaft,float relInit=0.1); - - - -KategProblem *fromKModel(const char *str,int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency); - - -KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency); - -void writeClasses(Array &katOfWord,KategProblem &problem,ostream &to); - - - -int fromCatFile(KategProblem *p,const char *s,bool verb=1); - - - -KategProblem *izrOptimization(KategProblem &p,int minN,int firstN, -double clockForOneRed,double maxClock,int verfahren); - - - -void KategProblemSetParameters(KategProblem &p); - - diff --git a/ext/giza-pp/mkcls-v2/KategProblemWBC.cpp b/ext/giza-pp/mkcls-v2/KategProblemWBC.cpp deleted file mode 100644 index 1a0d4397..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblemWBC.cpp +++ /dev/null @@ -1,344 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
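
// izrOptimization above drives a multi-start strategy: it optimizes a
// pool of independently initialized solutions, sorts the pool by
// objective value (compareProblem), and carries only the best candidates
// into the next reduction round. The skeleton of that strategy, reduced
// to standard types; Solution and the two callbacks are placeholders,
// not the mkcls API.

#include <algorithm>
#include <functional>
#include <vector>

struct Solution { double value; /* ... class assignment ... */ };

Solution multiStartBest(int restarts,
                        const std::function<Solution()> &init,
                        const std::function<void(Solution &)> &optimize) {
  std::vector<Solution> pool(restarts);
  for (Solution &s : pool) { s = init(); optimize(s); }  // independent runs
  std::sort(pool.begin(), pool.end(),
            [](const Solution &a, const Solution &b) { return a.value < b.value; });
  return pool.front();                                   // lowest cost wins
}
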
- -*/ - - - - -#include -#include "KategProblem.h" - -static int oneFreqCompareSteigend(const void *p,const void *j) -{ -#ifdef FREQTYPE_DOUBLE - if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) ) - return -1; - if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) ) - return +1; - else - return 0; -#else - return ((OneFreq *)p)->n - ((OneFreq *)j)->n; -#endif -} -static int oneFreqCompareFallend(const void *p,const void *j) -{ -#ifdef FREQTYPE_DOUBLE - if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) ) - return -1; - if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) ) - return +1; - else - return 0; -#else - return -((OneFreq *)p)->n + ((OneFreq *)j)->n; -#endif -} - - -KategProblemWBC::KategProblemWBC(int n,int minw) -: _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0), - mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1) - -{ -} - -KategProblemWBC::~KategProblemWBC() - -{ - massert( after.size()==nWords); - if( absteigend ) - delete absteigend; -} - -void KategProblemWBC::init(int specialFixedWord) -{ - - nTranspWords=0; - int i; - for(i=0;i<_n1.size();i++) - { - if( (_n1[i]1 ) - { - cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords) - +(afterFilledSum/(float)nWords) << endl; - cout << "Hapaslegomena: " << enaNom << endl; - } - int symmetrisch=1; - for(i=0;i1 ) - cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl; - } - - } - if(verboseMode && symmetrisch==0) - cout << "Warning: word bigram statistic is not symmetric " - "(this is possibly an error)\n"; - return ret; -} - -Array &KategProblemWBC::getSortedList(int steigend) - -{ - int siz=_n2.size(),i; - massert(filled); - Array &sortedList =*new Array(siz); - Array list(siz); - int pos=0; - for(i=0;i=0 ) - { - list[pos].w=i; - list[pos].n=_n1[i]; - pos++; - } - } - massert(pos==siz); - if(steigend ) - qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareSteigend); - else - qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareFallend); - massert( anzFree<=list.size() ); - - for(i=0;i=anzFree || list[i-1].n>=list[i].n ); - massert((!steigend) || i==0 || i>=anzFree || list[i-1].n<=list[i].n ); - } - return sortedList; -} - -FreqType KategProblemWBC::numberOfWords() - -{ - FreqType n1=0,n2=0; - for(int i=0;i<_n1.size();i++) - { - n1+=_n1[i]; - n2+=_n2[i]; - } - #ifndef FREQTYPE_DOUBLE - massert(n1==n2); - #endif - return n1; -} - -void KategProblemWBC::setDollar(int n) - -{ - if( fixedWord[n]<0 ) - nTranspWords--; - fixedWord[n]=0; -} - -void KategProblemWBC::initializeIndex(const leda_array&words,char firstChar,int unten,int oben,bool noHapas) -{ - int n=0; - int i; - massert(-1=mindestAnzahl || ((short)(n2(i)+0.0001))>=mindestAnzahl) ) - { - minIndex[i]=unten; - maxIndex[i]=oben; - n++; - } - } - if( verboseMode ) - cout << "InitializeIndex gefunden fuer " << n << " Woerter.\n"; -} - diff --git a/ext/giza-pp/mkcls-v2/KategProblemWBC.h b/ext/giza-pp/mkcls-v2/KategProblemWBC.h deleted file mode 100644 index 8a399e5b..00000000 --- a/ext/giza-pp/mkcls-v2/KategProblemWBC.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
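
// The two qsort comparators above return the sign of the difference
// between counts; the FREQTYPE_DOUBLE branch compares explicitly because
// truncating a double difference to int would be wrong. The explicit
// three-way form is also the safe choice for integers, since a plain
// subtraction can overflow. Both directions, with an illustrative struct:

struct OneFreqS { int w; long n; };

static int cmpAscendingS(const void *a, const void *b) {
  long x = ((const OneFreqS *)a)->n;
  long y = ((const OneFreqS *)b)->n;
  return (x > y) - (x < y);        // -1, 0, +1 without overflow
}
static int cmpDescendingS(const void *a, const void *b) {
  return -cmpAscendingS(a, b);
}
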
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef KATEGPROBLEMWBC_H -#define KATEGPROBLEMWBC_H - -struct OneFreq -{ - int w; - FreqType n; -}; - -typedef Array ManyFreq; - -class KategProblemWBC - -{ - - friend class KategProblem; - - private: - Array _n1; - - Array _n2; - - - double h_of_words; - - - short with_h_of_words; - - Array afterFilled; - Array beforeFilled; - - Array &getSortedList(int steigend); - - - protected: - KategProblemWBC(int n,int minw); - - - ~KategProblemWBC(); - - - short filled; - - Array fixedWord; - Array *absteigend; - - void init(int specialFixedWord=-1); - - - public: - int nWords; - int nTranspWords; - short mindestAnzahl; - Array after; - Array before; - Array minIndex; - Array maxIndex; - - - - void setAfterWords(int w,int anzahl); - - - void setBeforeWords(int w,int anzahl); - - - void setFreq(int w1,int w2, FreqType anzahl); - - - void addFreq(int w1,int w2,FreqType anzahl); - - - void setDollar(int n); - - - int fixed(int w) - { - return fixedWord[w]; - } - - FreqType n1(int w) { return _n1[w];}; - - - FreqType n2(int w) { return _n2[w];}; - - - FreqType numberOfWords(); - - - short testFull(int doIt=0); - - - double get_h_of_words(); - - - void set_h_of_words(double s); - - - void initializeIndex(const leda_array&words,char firstChar,int min,int max,bool noHapas); -}; - -#endif diff --git a/ext/giza-pp/mkcls-v2/LICENSE b/ext/giza-pp/mkcls-v2/LICENSE deleted file mode 100644 index 5b2225e4..00000000 --- a/ext/giza-pp/mkcls-v2/LICENSE +++ /dev/null @@ -1,282 +0,0 @@ - - -Preamble - -The licenses for most software are designed to take away your freedom -to share and change it. By contrast, the GNU General Public License is -intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - -When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - -To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the software, or if you modify it. - -For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. 
- -We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - -Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, -we want its recipients to know that what they have is not the -original, so that any problems introduced by others will not reflect -on the original authors' reputations. - -Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at -all. - -The precise terms and conditions for copying, distribution and -modification follow. - - -TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - -0. This License applies to any program or other work which contains a -notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the Program -(independent of having been made by running the Program). Whether that -is true depends on what the Program does. - -1. You may copy and distribute verbatim copies of the Program's source -code as you receive it, in any medium, provided that you conspicuously -and appropriately publish on each copy an appropriate copyright notice -and disclaimer of warranty; keep intact all the notices that refer to -this License and to the absence of any warranty; and give any other -recipients of the Program a copy of this License along with the -Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a -fee. - -2. You may modify your copy or copies of the Program or any portion of -it, thus forming a work based on the Program, and copy and distribute -such modifications or work under the terms of Section 1 above, -provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that - in whole or in part contains or is derived from the Program or - any part thereof, to be licensed as a whole at no charge to all - third parties under the terms of this License. 
- - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you - provide a warranty) and that users may redistribute the program - under these conditions, and telling the user how to view a copy - of this License. (Exception: if the Program itself is interactive - but does not normally print such an announcement, your work based - on the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - -3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of - Sections 1 and 2 above on a medium customarily used for software - interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. 
- -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - -4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt otherwise -to copy, modify, sublicense or distribute the Program is void, and -will automatically terminate your rights under this License. However, -parties who have received copies, or rights, from you under this -License will not have their licenses terminated so long as such -parties remain in full compliance. - -5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - -6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted -herein. You are not responsible for enforcing compliance by third -parties to this License. - - -7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - -8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - -9. The Free Software Foundation may publish revised and/or new -versions of the General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Program does not specify a -version number of this License, you may choose any version ever -published by the Free Software Foundation. - -10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the -author to ask for permission. For software which is copyrighted by the -Free Software Foundation, write to the Free Software Foundation; we -sometimes make exceptions for this. Our decision will be guided by the -two goals of preserving the free status of all derivatives of our free -software and of promoting the sharing and reuse of software generally. - -NO WARRANTY - -11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE -LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS -AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF -ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - -12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - -END OF TERMS AND CONDITIONS diff --git a/ext/giza-pp/mkcls-v2/MSBOptimization.cpp b/ext/giza-pp/mkcls-v2/MSBOptimization.cpp deleted file mode 100644 index 94788265..00000000 --- a/ext/giza-pp/mkcls-v2/MSBOptimization.cpp +++ /dev/null @@ -1,229 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "MSBOptimization.h" -#include -#include "ProblemTest.h" - -#ifdef __GNUC__ -template class Array; -template class Array; -#endif - -struct doubleInt { double a; int i; }; -static int doubleintcompare(const void *p,const void *j) -{ - if(((struct doubleInt *)p)->a < ((doubleInt *)j)->a) - return -1; - else if(((struct doubleInt *)p)->a == ((doubleInt *)j)->a) - return 0; - else - return 1; -} - - -MSBOptimization::MSBOptimization(Problem &p,int verf,int anz,Array &pos,Array &por) -: PopOptimization(p,verf,anz), -percentOfSteps(pos),percentOfRun(por),nachMinimierung(0) -{ -} - - -void MSBOptimization::zInitialize() -{ - PopOptimization::zInitialize(); - - int iterationsschritte; - double mean; - StatVar end,laufzeit,start; - zufallSeed(); - - - - - solveProblem(ProblemTestVerboseMode,*originalProblem,2,-1,verfahren,mean, - end,laufzeit,start,0,&iterationsschritte); - expectedSteps=(int)(iterationsschritte); - - if(verboseMode) - cout << "MSB:mean number of steps for one run: " << expectedSteps << endl; -} - - -double MSBOptimization::minimize(int) -{ - if( initialisiert==0 ) - zInitialize(); - - int i; - int anz=size(); - int numproblems=anz; - - if( verboseMode ) - { - double usedSteps=0; - for(i=0;iminimize(steps); - if(verboseMode)cout << "MSB:" << i << " " << a << ":" << v << endl; - } - - sort(); - - if(verboseMode) - cout << "MSB: best:" << problem(0)->value() - << " worst:" << problem(numproblems-1)->value() << endl; - - - numproblems=(int)(anz*(1.0-percentOfRun[i])); - if( numproblems<1 ) - numproblems=1; - if(verboseMode) - cout << "MSB: now i have : " << numproblems << " Problem's." 
-                << endl;
-      if(numproblems==1)
-        break;
-    }
-  assert( numproblems>0 );
-
-
-  for(int a=0;a<numproblems;a++)
-    optimization(a)->minimize(-1);
-  sort();
-
-  double ergebnisWert = problem(0)->value();
-  cout << "MSB: value:" << ergebnisWert << " (nicevalue:"
-       << problem(0)->nicevalue() << ")\n";
-  nachMinimierung=1;
-  return ergebnisWert;
-}
-
-
-
-void MSBOptimization::optimizeValues(Problem &p,int verfahren)
-{
-  int i;
-  struct doubleInt ri[20];
-  double mean;
-  StatVar end,laufzeit,start;
-  solveProblem(ProblemTestVerboseMode,p,5,-1,verfahren,mean,end,laufzeit,start);
-  double fivePercentSteps=(int)(laufzeit.getMean()/20.0);
-  double qualitaet[20][20];
-  for(i=0;i<20;i++)
-    {
-      Optimization *o=(Optimization *)genIterOptimizer(verfahren,p,-1);
-      for(int a=0;a<20;a++)
-        {
-          qualitaet[i][a]=o->minimize((int)fivePercentSteps);
-          cout << qualitaet[i][a] << " ";
-        }
-      ri[i].a=o->minimize(-1);
-      ri[i].i=i;
-      cout << ri[i].a << endl;
-      delete o;
-    }
-  qsort(ri,20,sizeof(struct doubleInt),doubleintcompare);
-
-  cout << "#Beschneidungsmatrix, welche die drei besten Laeufe erhaelt: ";
-  for(i=0;i<20;i++)
-    {
-      int a;
-      struct doubleInt v[20];
-      for(a=0;a<20;a++)
-        { v[a].i=a;v[a].a=qualitaet[a][i];}
-      qsort(v,20,sizeof(struct doubleInt),doubleintcompare);
-      int nr=0;
-      for(a=0;a<20;a++)
-        if( v[a].i==ri[0].i || v[a].i==ri[1].i || v[a].i==ri[2].i )
-          nr=a;
-      float percent=(1.0-nr/20.0)*100.0;
-      if(nr==2)
-        percent=100.0;
-      cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
-    }
-  cout << "#Beschneidungsmatrix, welche die zwei besten Laeufe erhaelt: ";
-  for(i=0;i<20;i++)
-    {
-      int a;
-      struct doubleInt v[20];
-      for(a=0;a<20;a++)
-        { v[a].i=a;v[a].a=qualitaet[a][i];}
-      qsort(v,20,sizeof(struct doubleInt),doubleintcompare);
-      int nr=0;
-      for(a=0;a<20;a++)
-        if( v[a].i==ri[0].i || v[a].i==ri[1].i )
-          nr=a;
-      float percent=(1.0-nr/20.0)*100.0;
-      if(nr==1)
-        percent=100.0;
-      cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
-    }
-  cout << "#Beschneidungsmatrix, welche den besten Lauf erhaelt: ";
-  for(i=0;i<20;i++)
-    {int a;
-      struct doubleInt v[20];
-      for(a=0;a<20;a++)
-        { v[a].i=a;v[a].a=qualitaet[a][i];}
-      qsort(v,20,sizeof(struct doubleInt),doubleintcompare);
-      int nr=0;
-      for(a=0;a<20;a++)
-        if( v[a].i==ri[0].i )
-          nr=a;
-      float percent=(1.0-nr/20.0)*100.0;
-      if(nr==0)
-        percent=100.0;
-      cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
-    }
-}
-
-
-Problem& MSBOptimization::bestProblem()
-{
-  assert(nachMinimierung==1);
-  return *(problem(0));
-}
diff --git a/ext/giza-pp/mkcls-v2/MSBOptimization.h b/ext/giza-pp/mkcls-v2/MSBOptimization.h
deleted file mode 100644
index ab30c984..00000000
--- a/ext/giza-pp/mkcls-v2/MSBOptimization.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-
-Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
-
-mkcls - a program for making word classes .
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
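
// MSBOptimization::minimize above interleaves partial optimization with
// pruning: every surviving candidate advances by a slice of the expected
// step budget (percentOfSteps), the pool is sorted by value, and the
// worst fraction (percentOfRun) is dropped until a single candidate
// remains, which is then run to convergence. A compact sketch of that
// schedule; Run, advance and the fraction semantics are illustrative.

#include <algorithm>
#include <vector>

struct Run { double value; /* optimizer state ... */ };

Run pruneSchedule(std::vector<Run> pool,
                  const std::vector<double> &stepFrac,  // budget share per phase
                  const std::vector<double> &keepFrac,  // pool share kept per phase
                  int expectedSteps,
                  double (*advance)(Run &, int steps)) {
  for (size_t i = 0; i < stepFrac.size() && pool.size() > 1; ++i) {
    int steps = (int)(expectedSteps * stepFrac[i]);
    for (Run &r : pool)
      r.value = advance(r, steps);                      // partial minimization
    std::sort(pool.begin(), pool.end(),
              [](const Run &a, const Run &b) { return a.value < b.value; });
    size_t keep = std::max<size_t>(1, (size_t)(pool.size() * keepFrac[i]));
    pool.resize(keep);                                  // prune the worst runs
  }
  return pool.front();                                  // finish this one fully
}
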
- -*/ - - - - - - - -#ifndef MSBOPTIMIZATION -#define MSBOPTIMIZATION - -#include "PopOptimization.h" - -class MSBOptimization : public PopOptimization - { - - protected: - - Array percentOfSteps; - Array percentOfRun; - - int expectedSteps; - short nachMinimierung; - - virtual void zInitialize(); - - - public: - MSBOptimization(Problem &s,int verf,int anz,Array &pos, - Array &por); - - - virtual ~MSBOptimization(){} - - virtual double minimize(int steps=-1); - - - static void optimizeValues(Problem &p,int verfahren); - - - Problem& bestProblem(); - - -}; -#endif - - - - - - - - - - - diff --git a/ext/giza-pp/mkcls-v2/MYOptimization.cpp b/ext/giza-pp/mkcls-v2/MYOptimization.cpp deleted file mode 100644 index ced9d318..00000000 --- a/ext/giza-pp/mkcls-v2/MYOptimization.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "MYOptimization.h" - -MYOptimization::MYOptimization(Problem &p,int m) -: IterOptimization(p,m),acceptFlagsNumber(0),acceptions(0),total(0) -{ -} -MYOptimization::MYOptimization(MYOptimization &o) -: IterOptimization(o),acceptFlagsNumber(0),acceptions(0),total(0) -{ -} -short MYOptimization::accept(double delta) - { - int doIt; - int verbesserung = delta<0; - if( delta < 0 ) - doIt=1; - else - { - if(total>=NUMBER_OF_ACCEPTIONS) - { - double prob = acceptions/(float)(NUMBER_OF_ACCEPTIONS); - double zuf = zufall01(); - - doIt=zuf=NUMBER_OF_ACCEPTIONS ) - { - if( acceptFlags[acceptFlagsNumber] ) - acceptions--; - } - acceptFlags[acceptFlagsNumber]=verbesserung; - if( verbesserung ) - acceptions++; - total++; - acceptFlagsNumber++; - if(acceptFlagsNumber>=NUMBER_OF_ACCEPTIONS) - acceptFlagsNumber=0; - return doIt; - } - -short MYOptimization::end() - { - return endFlag>0 && total>NUMBER_OF_ACCEPTIONS && acceptions==0; - } -void MYOptimization::abkuehlen() - { - } - - - -void MYOptimization::makeGraphOutput() -{ - IterOptimization::makeGraphOutput(); - *GraphOutput << acceptions; -} - diff --git a/ext/giza-pp/mkcls-v2/MYOptimization.h b/ext/giza-pp/mkcls-v2/MYOptimization.h deleted file mode 100644 index a6ca70ca..00000000 --- a/ext/giza-pp/mkcls-v2/MYOptimization.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
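
// MYOptimization::accept above always takes improving moves and appears
// to accept a worsening move with probability equal to the share of
// improvements among the last NUMBER_OF_ACCEPTIONS moves, tracked in a
// ring buffer (and to accept freely while that window is still filling).
// A reduced sketch of that adaptive acceptance rule; the window size and
// all names here are illustrative.

class AdaptiveAccept {
  enum { WINDOW = 100 };
  int flags[WINDOW] = {0};     // 1 where a recent move was an improvement
  int pos = 0, improvements = 0, total = 0;
public:
  // uniform01 is a random draw from [0,1), supplied by the caller.
  bool accept(double delta, double uniform01) {
    bool better = delta < 0;
    bool doIt = better || total < WINDOW;            // warm-up: accept freely
    if (!better && total >= WINDOW)
      doIt = uniform01 < improvements / (double)WINDOW;
    if (total >= WINDOW && flags[pos])
      --improvements;                                // forget the oldest entry
    flags[pos] = better;
    if (better) ++improvements;
    ++total;
    pos = (pos + 1) % WINDOW;
    return doIt;
  }
};
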
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - -#ifndef MYOPTIMIZATION -#define MYOPTIMIZATION -#include "IterOptimization.h" - -#define NUMBER_OF_ACCEPTIONS 100 - -class MYOptimization: public IterOptimization { - - protected: - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - public: - MYOptimization(Problem &p,int maxIter=-1); - - - MYOptimization(MYOptimization &o); - - - int acceptFlags[NUMBER_OF_ACCEPTIONS],acceptFlagsNumber; - int acceptions,total; - - void makeGraphOutput(); - -}; - -#endif diff --git a/ext/giza-pp/mkcls-v2/Makefile b/ext/giza-pp/mkcls-v2/Makefile deleted file mode 100644 index cec1673a..00000000 --- a/ext/giza-pp/mkcls-v2/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -OBJS = GDAOptimization.o HCOptimization.o Problem.o \ - IterOptimization.o ProblemTest.o RRTOptimization.o \ - MYOptimization.o SAOptimization.o TAOptimization.o \ - Optimization.o KategProblemTest.o KategProblemKBC.o \ - KategProblemWBC.o KategProblem.o StatVar.o general.o \ - mkcls.o - -CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -DNDEBUG -O3 -funroll-loops -std=c++11 - -.cpp.o: - $(CXX) $(CFLAGS) -c $< -o $@ - -LDFLAGS = - -mkcls: $(OBJS) - $(CXX) $(CFLAGS) -o mkcls $(OBJS) $(LDFLAGS) - -remove clean: - -rm -f *.o mkcls - - - - diff --git a/ext/giza-pp/mkcls-v2/Optimization.cpp b/ext/giza-pp/mkcls-v2/Optimization.cpp deleted file mode 100644 index 03e06df0..00000000 --- a/ext/giza-pp/mkcls-v2/Optimization.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "Optimization.h" - -Optimization::~Optimization() {} - diff --git a/ext/giza-pp/mkcls-v2/Optimization.h b/ext/giza-pp/mkcls-v2/Optimization.h deleted file mode 100644 index 4c434279..00000000 --- a/ext/giza-pp/mkcls-v2/Optimization.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef OPTIMIZATION -#define OPTIMIZATION - -#include "Problem.h" -#include "general.h" - -class Optimization -{ - -public: - - virtual double minimize(int steps)=0; - virtual ~Optimization(); - -}; -#endif - - - - diff --git a/ext/giza-pp/mkcls-v2/PopOptimization.cpp b/ext/giza-pp/mkcls-v2/PopOptimization.cpp deleted file mode 100644 index 2e65a2c8..00000000 --- a/ext/giza-pp/mkcls-v2/PopOptimization.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "PopOptimization.h" -#include "ProblemTest.h" - - -int compareProbAndOpt(const void *p,const void *j) -{ - double a=((ProbAndOpt *)p)->prob->value(); - double b=((ProbAndOpt *)j)->prob->value(); - if(a==b) - return 0; - if(avalue()value(); - } -bool operator==(const ProbAndOpt&a, const ProbAndOpt&b) - { - return a.prob->value()==b.prob->value(); - } - -ostream& operator<<(ostream&o , const ProbAndOpt&){return o;} -istream& operator>>(istream&i , ProbAndOpt&){return i;} - - - -PopOptimization::PopOptimization(Problem &p,int verf,int anz) -: probandopt(anz),initialisiert(0),verfahren(verf) -{ - originalProblem = &p; -} - - -int PopOptimization::size() -{ - return probandopt.size(); -} - -Problem *PopOptimization::problem(int i) -{ - assert(initialisiert); - return probandopt[i].prob; -} - -Optimization *PopOptimization::optimization(int i) -{ - assert(initialisiert); - return probandopt[i].opt; -} - -void PopOptimization::zInitialize() -{ - int i; - zufallSeed(); - for(i=0;imakeEqualProblem(); - probandopt[i].prob->initialize(); - } - - zufallSeed(); - for(i=0;i>(istream& , ProbAndOpt&b); - -inline DEFINE_STANDARD_COMPARE(ProbAndOpt) - -int compareProbAndOpt(const void *p,const void *j); - -class PopOptimization : public Optimization { - - - private: - Array probandopt; - - protected: - int initialisiert; - Problem *originalProblem; - - - int verfahren; - - - virtual void zInitialize(); - - - public: - PopOptimization(Problem &s,int verf,int anz); - - - virtual ~PopOptimization() {} - - int size(); - - - void sort(); - - - virtual Problem& bestProblem()=0; - - - Problem *problem(int i); - - - Optimization *optimization(int i); - - -}; -#endif diff --git a/ext/giza-pp/mkcls-v2/Problem.cpp b/ext/giza-pp/mkcls-v2/Problem.cpp deleted file mode 100644 index 6e126c80..00000000 --- a/ext/giza-pp/mkcls-v2/Problem.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . 
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - -#include "Problem.h" -#include "Optimization.h" - -Problem::~Problem() {} - -Problem::Problem(int max,int anz,int _initialisierung,int _auswertung, - int _nachbarschaft) -: initialized(0),curCompVal(0),curCompChange(0),maxCompVal(max),maxComp(anz),curComp(0), - initialisierung(_initialisierung),auswertung(_auswertung),nachbarschaft(_nachbarschaft), - numberOfFullEvaluations(0),numberOfPartEvaluations(0),numberOfDoChange(0) -{ - if( verboseMode>1 ) - cout << "Initialization of Problem: " << maxComp << " " << maxCompVal - << endl; -} - -void Problem::initialize(int i) -{ - curComp=curCompVal=curCompChange=0; - numberOfFullEvaluations=numberOfPartEvaluations=numberOfDoChange=0; - initialized=1; - if( i== -23 ) - _initialize(initialisierung); - else - _initialize(i); - maxComp=maxDimension(); - maxCompVal=maxDimensionVal(); -} - -void Problem::doChange(ProblemChange &c) -{ - assert (initialized); - curCompChange=1; - _doChange(c); - numberOfDoChange++; -} - -void Problem::incrementDirection() -{ - if( maxCompVal==curCompVal ) - curCompVal=0; - curCompChange=0; - curComp=(curComp+1)%maxComp; -} - -ProblemChange& Problem::change() -{ - assert( initialized ); - assert( maxCompVal>=curCompVal); - - if( curCompChange||maxCompVal==curCompVal ) - incrementDirection(); - - ProblemChange *p; - int changeFound=_change(&p); - curCompVal++; - if( changeFound==0 ) - return change(); - else - return *p; -} -double Problem::value() -{ - numberOfFullEvaluations++; - if( !initialized ) - initialize(); - return _value(); -} - -double Problem::valueChange(ProblemChange &x) -{ - numberOfPartEvaluations++; - assert( initialized ); - double currentValue=value(); - _doChange(x);numberOfDoChange++; - double newValue=value(); - _undoChange(x);numberOfDoChange++; - assert( currentValue==value() ); - return newValue-currentValue; -} - -void Problem::dumpOn(ostream &strm) -{ - assert( initialized ); - strm << "Problem(" << initialisierung << "," << auswertung << "," - << nachbarschaft << ")\n"; - strm << " #value: " << numberOfFullEvaluations << endl; - strm << "#valueChange: " << numberOfPartEvaluations << endl; - strm << " #doChange: " << numberOfDoChange << endl; -} - -StatVar& Problem::deviationStatVar(Optimization &s,int anz) -{ - assert( initialized ); - StatVar &v=*new StatVar; - double cur=value(); - int howOften=0; - while( v.getNum()<anz ) - { - if( howOften++>50000 ) - break; - double neuer=s.minimize(1); - if( neuer>cur ) - v.addValue(neuer-cur); - cur=neuer; - vassert(NULLFLOAT(cur-value())); - } - return v; -} - -void Problem::dumpInfos(ostream &strm) -{ - strm << "Problem: " << endl; - assert( initialized ); -} - - -double Problem::nicevalue(double) -{ - return value(); -} - -int Problem::maxDimensionVal(void) {return -1;} -int Problem::maxDimension(void) {return -1;} - -ProblemChange::~ProblemChange() - { - } - 
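The Problem class deleted above is the contract every mkcls optimizer relies on: a subclass exposes its search space through _change/_doChange/_undoChange, and valueChange() prices a candidate move by applying it, measuring, and rolling it back, so a move is only committed via doChange() when the optimizer accepts the delta. A self-contained sketch of that apply-measure-undo protocol (ToyProblem and Move are invented here for illustration; they are not the original classes):

```cpp
#include <cassert>
#include <iostream>
#include <vector>

// A candidate move: set state[index] to newValue; oldValue is filled in
// when the move is applied so it can be undone exactly.
struct Move { std::size_t index; int newValue; int oldValue; };

class ToyProblem {
  std::vector<int> state_;
public:
  explicit ToyProblem(std::size_t n) : state_(n, 0) {}
  double value() const {                       // toy cost: sum of squares
    double s = 0;
    for (int v : state_) s += v * v;
    return s;
  }
  void doChange(Move &m)   { m.oldValue = state_[m.index]; state_[m.index] = m.newValue; }
  void undoChange(const Move &m) { state_[m.index] = m.oldValue; }
  // Mirrors Problem::valueChange(): apply, evaluate, undo, return the delta.
  double valueChange(Move &m) {
    const double before = value();
    doChange(m);
    const double after = value();
    undoChange(m);
    assert(before == value());                 // state must be restored exactly
    return after - before;
  }
};

int main() {
  ToyProblem p(4);
  Move m{0, 3, 0};
  double delta = p.valueChange(m);             // prices the move without committing
  std::cout << "delta = " << delta << "\n";    // 9: a worsening move
  if (delta < 0) p.doChange(m);                // commit only if it improves
}
```

The point of the protocol is that pricing a move costs two evaluations but never mutates the committed state, which is what lets the annealing-style optimizers below probe many moves per step.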
-ProblemChange::ProblemChange() - { - } - -void Problem::setValuesFrom(Problem *p) -{ - numberOfFullEvaluations=p->numberOfFullEvaluations; - numberOfPartEvaluations=p->numberOfPartEvaluations; - numberOfDoChange=p->numberOfDoChange; - initialized=p->initialized; -} diff --git a/ext/giza-pp/mkcls-v2/Problem.h b/ext/giza-pp/mkcls-v2/Problem.h deleted file mode 100644 index 337390ea..00000000 --- a/ext/giza-pp/mkcls-v2/Problem.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef PROBLEMCHANGE -#define PROBLEMCHANGE -#include <iostream> -#include "general.h" -#include "StatVar.h" - -class Optimization; - -class ProblemChange - -{ - public: - virtual ~ProblemChange(); - ProblemChange(); -}; - -class Problem { - - private: - short initialized; - int curCompVal; - short curCompChange; - int maxCompVal; - int maxComp; - - - protected: - int curComp; - - void setValuesFrom(Problem *p); - - virtual int maxDimensionVal(void) ; - - - virtual int maxDimension(void) ; - - - inline int curDimension(void) { assert(maxComp!=-1);return curComp;} - - - inline int curDimensionVal(void) { assert(maxComp!=-1);return curCompVal;} - - - - virtual void _doChange(ProblemChange &c)=0; - - - virtual int _change(ProblemChange **p)=0; - - - virtual void _undoChange(ProblemChange &c)=0; - - - virtual void _initialize(int initialisierung)=0; - - - virtual double _value()=0; - - - public: - Problem(int maxCompVal=-1,int maxComp=-1,int _initialisierung=0, - int _auswertung=0,int _nachbarschaft=0); - - virtual ~Problem(); - - - void doChange(ProblemChange &c); - - - ProblemChange& change(); - - - virtual double value(); - - - virtual double valueChange(ProblemChange &c); - - - virtual void initialize(int a= -23); - - - inline virtual short endCriterion(); - - - virtual int maxNonBetterIterations()=0; - - - virtual int expectedNumberOfIterations()=0; - - - virtual void dumpOn(ostream &strm); - - - virtual void dumpInfos(ostream &strm); - - - virtual Problem *makeEqualProblem()=0; - - - virtual double nicevalue(double vorher=1e100); - - - virtual StatVar& deviationStatVar(Optimization &s,int anz); - - - virtual void incrementDirection(); - - - - - - int initialisierung; - int auswertung; - int nachbarschaft; - - int numberOfFullEvaluations; - int numberOfPartEvaluations; - int numberOfDoChange; - - - -}; - -inline short Problem::endCriterion() -{ - return 0; -}; - -#endif - diff --git a/ext/giza-pp/mkcls-v2/ProblemTest.cpp b/ext/giza-pp/mkcls-v2/ProblemTest.cpp deleted file mode 100644 index 60ca39d9..00000000 --- a/ext/giza-pp/mkcls-v2/ProblemTest.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes .
- -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "ProblemTest.h" -#include "HCOptimization.h" -#include "RRTOptimization.h" -#include "SAOptimization.h" -#include "TAOptimization.h" -#include "GDAOptimization.h" -#include "MYOptimization.h" -#include <fstream> -#include "general.h" -#include <stdlib.h> - -short ProblemTestVerboseMode=1; -ofstream *PrintBestTo=0,*PrintBestTo2=0; - - -int compareProblem(const void *p,const void *j) -{ - double a=(*(Problem **)p)->value(); - double b=(*(Problem **)j)->value(); - if(a==b) - return 0; - if(a<b) - return -1; - else - return 1; -} - - - -double solveProblem(int verbose,Problem &problem,int versuche, - int optimierungsschritte,int verfahren,double &mean,StatVar &end, - StatVar &auswertungen,StatVar &start,double maxClock,int *iterationsschritte) -{ - StatVar dauer,iterschritte,startNice,endNice; - Problem *bestP=0; - double smallestV=1e100; - int i; - for(i=0;i<versuche;i++) - { - if(verbose>2) - { - cout << " " << i << " of " << versuche << ".\n"; - cout.flush(); - } - double vorher=clockSec(); - - IterOptimization *opt=genIterOptimizer(verfahren,problem, - optimierungsschritte); - problem.numberOfPartEvaluations=0; - - startNice.addValue(problem.nicevalue()); - start.addValue(problem.value()); - - double v=opt->minimize(optimierungsschritte); - - if( problem.numberOfPartEvaluations==0) - auswertungen.addValue(opt->getCurStep()); - else - auswertungen.addValue(problem.numberOfPartEvaluations); - iterschritte.addValue(opt->getCurStep()); - - endNice.addValue(problem.nicevalue()); - end.addValue(problem.value()); - dauer.addValue(clockSec()-vorher); - if( verbose>2 ) - { - cout << i << ". " << v << ": "; - problem.dumpOn(cout); - } - delete opt; - if( v<smallestV || bestP==0 ) - { - if( bestP ) - delete bestP; - bestP=problem.makeEqualProblem(); - smallestV=v; - } - if( verbose>2 ) - cout << " time: " << clockSec() << " best:" << endNice.quantil(0) - << " this:" << problem.nicevalue() << endl; - if( maxClock && clockSec()>maxClock ) - { - if(verbose) - cout << "Stop because of time limit ( " << (clockSec()-maxClock) - << " seconds)\n"; - break; - } - } - - if(verbose) - { - cout << "\n***** " << start.getNum() << " runs. 
(algorithm:"; - switch(verfahren) - { - case HC_OPT: - cout << "HC"; - break; - case RRT_OPT: - cout << "RRT"; - break; - case GDA_OPT: - cout << "GDA"; - break; - case TA_OPT: - cout << "TA"; - break; - case SA_OPT: - cout << "SA"; - break; - case MY_OPT: - cout << "MY"; - break; - default: - cout << "!unknown!"; - } - cout << ")*****\n"; - problem.dumpInfos(cout); - cout << endl; - cout << "start-costs: "; start.dumpOn(cout); cout << endl; - cout << " end-costs: "; end.dumpOn(cout); cout << endl; - cout << " start-pp: "; startNice.dumpOn(cout); cout << endl; - cout << " end-pp: "; endNice.dumpOn(cout); cout << endl; - cout << " iterations: "; auswertungen.dumpOn(cout); cout << endl; - cout << " time: "; dauer.dumpOn(cout); - cout << endl; - } - if( bestP ) - { - if(PrintBestTo) - bestP->dumpOn(*PrintBestTo); - else - bestP->dumpOn(cout); - delete bestP; - } - mean = end.getMean(); - if( iterationsschritte ) - *iterationsschritte=(int)(iterschritte.getMean()); - return end.getSmallest(); -} - - - -void multiSolveProblem(Problem &problem,int versuche,int maxSeconds) -{ - int i; - double rDummy; - StatVar end[MAX_OPT_NR],auswertungen[MAX_OPT_NR],start[MAX_OPT_NR]; - double maxClock=clockSec()+maxSeconds; - if(maxSeconds<=0)maxClock=0; - solveProblem(ProblemTestVerboseMode,problem,versuche,-1,HC_OPT,rDummy, - end[HC_OPT],auswertungen[HC_OPT],start[HC_OPT],maxClock); - //int maxLaeufe=(int)(auswertungen[HC_OPT].getMean()*5); - for(i=0;i<MAX_OPT_NR;i++) - { - if( i==HC_OPT ) - continue; - solveProblem(ProblemTestVerboseMode,problem,versuche,-1,i,rDummy, - end[i],auswertungen[i],start[i],maxClock); - } -} - - - -IterOptimization *genIterOptimizer(int verfahren,Problem &problem,int maxIter) -{ - switch(verfahren) - { - case HC_OPT: - return new HCOptimization(problem,maxIter); - case SA_OPT: - return new SAOptimization(problem,maxIter); - case TA_OPT: - return new TAOptimization(problem,maxIter); - case RRT_OPT: - return new RRTOptimization(problem,maxIter); - case GDA_OPT: - return new GDAOptimization(problem,maxIter); - case MY_OPT: - return new MYOptimization(problem,maxIter); - } - return 0; -} diff --git a/ext/giza-pp/mkcls-v2/ProblemTest.h b/ext/giza-pp/mkcls-v2/ProblemTest.h deleted file mode 100644 --- a/ext/giza-pp/mkcls-v2/ProblemTest.h +++ /dev/null -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef PROBLEMTEST -#define PROBLEMTEST - -#include "Problem.h" -#include "StatVar.h" -#include <fstream> - - -enum {TA_OPT, HC_OPT, SA_OPT,RRT_OPT,GDA_OPT,MY_OPT,MAX_OPT_NR }; - -class IterOptimization; - -extern short ProblemTestVerboseMode; - -extern ofstream *PrintBestTo,*PrintBestTo2; - -double solveProblem(int verbose,Problem &problem,int versuche, -int optimierungsschritte,int verfahren,double &mean,StatVar &endValue, -StatVar &laufzeit,StatVar &initValue,double maxSec= 0,int *iterationsschritte=0); - - - -int compareProblem(const void *p,const void *j); - - - -void multiSolveProblem(Problem &problem,int versuche,int maxSeconds); - - - -IterOptimization *genIterOptimizer(int verfahren,Problem &problem,int maxIter); - - -void metaOptimization(Problem &p,int nLaeufe,int nPars); - -#endif diff --git a/ext/giza-pp/mkcls-v2/README b/ext/giza-pp/mkcls-v2/README deleted file mode 100644 index 8e453df6..00000000 --- a/ext/giza-pp/mkcls-v2/README +++ /dev/null @@ -1,10 +0,0 @@ -======================================================================== -mkcls is a tool to train word classes using a -maximum-likelihood criterion. The resulting word classes are -especially suited for language models or statistical translation -models. The program mkcls was written by Franz Josef Och -(och@informatik.rwth-aachen.de) -======================================================================== - -To learn about the options of mkcls, simply start the program -without arguments. diff --git a/ext/giza-pp/mkcls-v2/RRTOptimization.cpp b/ext/giza-pp/mkcls-v2/RRTOptimization.cpp deleted file mode 100644 index 55e21225..00000000 --- a/ext/giza-pp/mkcls-v2/RRTOptimization.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version.
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include "RRTOptimization.h" -#include "ProblemTest.h" - -double RRTOptimization::defaultAnnRate=0.6; -double RRTOptimization::defaultMultiple=2.0; - - - -RRTOptimization::RRTOptimization(Problem &p,double t,double dt,int m) -: IterOptimization(p,m),deviation(t),deltaDeviation(dt) -{ - assert(deviation>=0); -} - - - -RRTOptimization:: RRTOptimization(Problem &p,int m) -: IterOptimization(p,m),deviation(-1),deltaDeviation(0) -{ -} - - - -RRTOptimization::RRTOptimization(RRTOptimization &o) -: IterOptimization(o) -{ - deviation = o.deviation; - deltaDeviation= o.deltaDeviation; - record = o.record; -} - - - -void RRTOptimization::zInitialize() -{ - IterOptimization::zInitialize(); - if( deviation<0 ) - { - - - int n; - - StatVar &v=problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN); - - if( maxStep>0 ) - n=(int)(maxStep*4.0/5.0); - else - maxStep=n=(int)(problem.expectedNumberOfIterations()*defaultMultiple); - - deviation = v.quantil(defaultAnnRate); - deltaDeviation = deviation/(float)n; - - if( verboseMode>0 ) - cout << "#Algorithm: Record-To-Record-Travel: (anfAnnRate=" - << defaultAnnRate << ",T=" << deviation << ",deltaT=" - << deltaDeviation << ")\n"; - - curStep=0; - endFlag=0; - delete &v; - problem.initialize(); - IterOptimization::zInitialize(); - } - record=problem.value(); - assert(deviation>=0); -} - -short RRTOptimization::end() -{ - return ( endFlag>0 && deviation==0.0 ); -} -void RRTOptimization::abkuehlen() -{ - if( deviation>=0 ) - { - deviation -= deltaDeviation; - if(deviation<0) - deviation=0; - } -} -short RRTOptimization::accept(double delta) -{ - if( deviation<0 ) - return 1; - else - { - if( delta + curValue - deviation < record ) - { - if( delta + curValue < record ) - record = delta+curValue; - return 1; - } - else - return 0; - } -} - -void RRTOptimization::makeGraphOutput() -{ - IterOptimization::makeGraphOutput(); - *GraphOutput << deviation; -} - - - - -double RRTOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ, - int optimierungsschritte,int print) -{ - switch(typ) - { - case 1: - { - double bestPar=-1,best=1e100; - if( print ) - cout << "#RRT-optimizeValues: Quantil: " << numParameter << endl; - for(int i=0;i<=numParameter;i++) - { - StatVar end,laufzeit,init; - double now; - if(i==0) defaultAnnRate=0.2; - else defaultAnnRate = 0.3+(float)(0.6*i)/numParameter; - solveProblem(0,p,proParameter,optimierungsschritte,RRT_OPT,now, - end,laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultAnnRate; - } - if( print ) - { - cout << defaultAnnRate << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit " - "Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultAnnRate=0.8; - return bestPar; - } - break; - case 10: - { - double i; - double bestPar=-1,best=1e100; - StatVar 
end,laufzeit,init; - - if( print ) - cout << "#RRT-optimizeValues: defaultMultiple" << 8 << endl; - for(i=0.5;i<=10;i+=1.5) - { - double now; - defaultMultiple = i; - solveProblem(0,p,proParameter,optimierungsschritte,RRT_OPT,now, - end,laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultMultiple; - } - if( print ) - { - cout << defaultMultiple << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit " - "Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultMultiple=2.0; - return bestPar; - } - break; - default: - cerr << "Error: wrong parameter-type in RRTOptimization::optimizeValue (" - << typ << ")\n"; - exit(1); - } - return 1e100; -} - - diff --git a/ext/giza-pp/mkcls-v2/RRTOptimization.h b/ext/giza-pp/mkcls-v2/RRTOptimization.h deleted file mode 100644 index 42ec6e22..00000000 --- a/ext/giza-pp/mkcls-v2/RRTOptimization.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef RRTOPTIMIZATION -#define RRTOPTIMIZATION -#include "IterOptimization.h" - -class RRTOptimization : public IterOptimization { - - - private: - double deviation; - double deltaDeviation; - double record; - - protected: - virtual void zInitialize(); - - - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - virtual void makeGraphOutput(); - - - public: - RRTOptimization(Problem &p,double temperatur, - double deltaTemperatur,int maxIter=-1); - - - RRTOptimization(Problem &p,int maxIter=-1); - - - RRTOptimization(RRTOptimization &o); - - - static double optimizeValue(Problem &p,int proParameter, - int numParameter,int typ,int schritte= -1,int verbose=1); - - - static double defaultAnnRate; - - static double defaultMultiple; - -}; - -#endif diff --git a/ext/giza-pp/mkcls-v2/SAOptimization.cpp b/ext/giza-pp/mkcls-v2/SAOptimization.cpp deleted file mode 100644 index 6ae589a5..00000000 --- a/ext/giza-pp/mkcls-v2/SAOptimization.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. 
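RRTOptimization above implements record-to-record travel: a move is accepted whenever the resulting cost stays within a slowly shrinking `deviation` of the best cost seen so far (the record), and abkuehlen() lowers that allowance linearly to zero. A minimal standalone version of the same acceptance loop, assuming a 1-D cost function and a uniform random neighbour step (rrtMinimize and parabola are hypothetical names, not part of mkcls):

```cpp
#include <cstdio>
#include <cstdlib>

// Record-to-record travel on a 1-D function: accept a neighbour when its
// value stays within `deviation` of the record; shrink `deviation` linearly.
double rrtMinimize(double (*f)(double), double x, double deviation,
                   double deltaDeviation, int steps, unsigned seed = 1) {
  double record = f(x);
  std::srand(seed);
  for (int i = 0; i < steps; i++) {
    double cand = x + (std::rand() / (double)RAND_MAX - 0.5);  // random neighbour
    double v = f(cand);
    if (v - deviation < record) {   // same test as accept(): delta+cur-deviation < record
      x = cand;
      if (v < record) record = v;   // a new record, as in RRTOptimization::accept()
    }
    deviation = deviation > deltaDeviation ? deviation - deltaDeviation : 0.0;
  }
  return x;
}

static double parabola(double x) { return (x - 2) * (x - 2); }

int main() {
  double x = rrtMinimize(parabola, 10.0, 1.0, 1.0 / 2000, 2000);
  std::printf("x ~ %.3f\n", x);   // should drift toward the minimum at 2
}
```

Once `deviation` has decayed to zero the rule degenerates to strict descent relative to the record, which is why end() above only stops when both the deviation and the improvement flag are exhausted.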
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - -#include <math.h> -#include <stdlib.h> - -#include "SAOptimization.h" - -#include "ProblemTest.h" - -#define ALPHA 0.95 - -double SAOptimization::defaultAnfAnnRate=0.9; -double SAOptimization::defaultEndAnnRate=1e-9; -double SAOptimization::defaultMultiple=2.0; - - - -SAOptimization::SAOptimization(Problem &p,int m) -: IterOptimization(p,m), temperatur(-1) -{ -} - - - - -SAOptimization::SAOptimization(Problem &p,double t,double a,int s,int m) -: IterOptimization(p,m),temperatur(t), alpha(a),schrittzahl(s) -{ - assert(alpha<1); - assert(schrittzahl>0); - assert(t>0); -} - - -SAOptimization::SAOptimization(SAOptimization &o) -: IterOptimization(o) -{ - temperatur = o.temperatur; - endTemperatur = o.endTemperatur; - alpha = o.alpha; - schrittzahl = o.schrittzahl; - stepsForAbkuehlung = o.stepsForAbkuehlung; -} - - -void SAOptimization::zInitialize() -{ - IterOptimization::zInitialize(); - if( temperatur<0) - { - - - - StatVar &v=problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN); - - if( maxStep>0 ) - stepsForAbkuehlung=(int)(maxStep*4.0/5.0); - else - maxStep=stepsForAbkuehlung=(int)(problem.expectedNumberOfIterations()* - defaultMultiple); - - temperatur = v.getMean()/log(1/defaultAnfAnnRate); - endTemperatur = v.getMean()/log(1/defaultEndAnnRate); - schrittzahl = (int)(stepsForAbkuehlung/(log(endTemperatur/temperatur)/ - log(ALPHA))); - if(schrittzahl==0)schrittzahl=1; - alpha = ALPHA; - - if( verboseMode ) - cout << "#Algorithm: Simulated Annealing(anfAnnRate=" - << defaultAnfAnnRate << ",endAnnRate=" << defaultEndAnnRate - << ",T0=" << temperatur<< ",Te=" << endTemperatur<< ",schrittzahl=" - << schrittzahl<< ",stepsForAbkuehlung=" << stepsForAbkuehlung - << ")\n"; - curStep=0; - endFlag=0; - delete &v; - problem.initialize(); - IterOptimization::zInitialize(); - } -} - -short SAOptimization::end() -{ - if( temperatur>endTemperatur ) - bestStep = curStep; - if( endFlag>0 && temperatur<endTemperatur ) - return 1; - else - return 0; -} - -void SAOptimization::abkuehlen() -{ - if( temperatur>=0 ) - { - if( curStep%schrittzahl == 0 ) - temperatur=temperatur * alpha; - if( curStep> stepsForAbkuehlung) - temperatur = 0; - } -} -short SAOptimization::accept(double delta) -{ - if( temperatur<0 ) - return 1; - else - { - if( delta > 0 ) - { - if( temperatur==0 ) - return 0; - else - { - double z=zufall01(); - assert(z!=0.0); - if(z==0.0) - z+=1e-20; - double e=exp(-delta/temperatur); - - - - return z+0.000000000001<=e; - } - } - else - return 1; - } -} - -void SAOptimization::makeGraphOutput() -{ - IterOptimization::makeGraphOutput(); - *GraphOutput << temperatur; -} - - - - -double SAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter, - int typ,int optimierungsschritte,int print) -{ - switch(typ) - { - case 1: - { - double bestPar=-1,best=1e100; - double now; - if( print ) - cout << "#SA-optimizeValues: defaultAnfAnnRate" << endl; - for(int i=0;i<numParameter;i++) - { - StatVar end,laufzeit,init; - defaultAnfAnnRate = 0.5+(float)(i)/numParameter/2; - solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,end, - laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultAnfAnnRate; - } - if( print ) - { - cout << defaultAnfAnnRate << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << 
end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit " - "Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultAnfAnnRate=0.9; - return bestPar; - } - break; - case 2: - { - double bestPar=-1,best=1e100; - double now; - if( print ) - cout << "#Optimierung von SA: defaultEndAnnRate" << endl; - for(int i=1;i<=numParameter;i++) - { - StatVar end,laufzeit,init; - defaultEndAnnRate=1/(pow(10.0,i)); - solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,end, - laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultEndAnnRate; - } - if( print ) - { - cout << defaultEndAnnRate << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit " - "Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultEndAnnRate=1/10000.0; - return bestPar; - } - break; - case 10: - { - double bestPar=-1,best=1e100; - - if( print ) - cout << "#SA-optimizeValues: defaultMultiple " << 8 << endl; - for(int i=1;i<=6;i++) - { - StatVar end,laufzeit,init; - double now; - defaultMultiple = i; - solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,end, - laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultMultiple; - } - if( print ) - { - cout << defaultMultiple << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit " - "Bester Sigma SigmaSmaller SigmaBigger\n"; - defaultMultiple=2.0; - return bestPar; - } - break; - default: - cerr << "Error: wrong parameter-type in SAOptimization::optimizeValue (" - << typ << ")\n"; - exit(1); - } - return 1e100; -} - - - diff --git a/ext/giza-pp/mkcls-v2/SAOptimization.h b/ext/giza-pp/mkcls-v2/SAOptimization.h deleted file mode 100644 index 97c528b5..00000000 --- a/ext/giza-pp/mkcls-v2/SAOptimization.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
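The accept()/abkuehlen() pair above is textbook simulated annealing: improving moves always pass, worsening moves pass with probability exp(-delta/T), and the temperature decays geometrically by alpha every schrittzahl steps until stepsForAbkuehlung freezes it at zero. A minimal sketch of just that rule, outside the IterOptimization framework (saAccept is a hypothetical name):

```cpp
#include <cmath>
#include <cstdio>
#include <cstdlib>

// Metropolis-style acceptance as in SAOptimization::accept().
bool saAccept(double delta, double T) {
  if (delta <= 0) return true;              // improvements always accepted
  if (T == 0)     return false;             // frozen: strict descent only
  double z = std::rand() / (RAND_MAX + 1.0);
  return z < std::exp(-delta / T);          // worse moves pass with prob e^(-delta/T)
}

int main() {
  double T = 1.0, alpha = 0.95;             // ALPHA in the file above
  int schrittzahl = 10;                     // cool every 10 steps
  for (int step = 1; step <= 100; step++) {
    double delta = 0.5;                     // a hypothetical worsening move
    std::printf("step %3d T=%.4f accept=%d\n", step, T, saAccept(delta, T));
    if (step % schrittzahl == 0) T *= alpha;   // geometric cooling, as in abkuehlen()
  }
}
```

The zInitialize() code above does the interesting part mkcls adds on top: it samples typical worsening deltas with deviationStatVar() and solves for T0 and Te so that the requested start and end acceptance rates (defaultAnfAnnRate, defaultEndAnnRate) hold for this particular problem.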
- -*/ - - - - - - - -#ifndef SAOPTIMIZATION -#define SAOPTIMIZATION -#include "IterOptimization.h" - -class SAOptimization : public IterOptimization - { - - - private: - double temperatur; - double endTemperatur; - double alpha; - int schrittzahl; - int stepsForAbkuehlung; - - protected: - virtual void zInitialize(); - - - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - virtual void makeGraphOutput(); - - - public: - SAOptimization(Problem &p,double temperatur,double alpha, - int schrittzahl,int maxIter=-1); - - - SAOptimization(Problem &p,int maxIter=-1); - - - SAOptimization(SAOptimization &o); - - - static double optimizeValue(Problem &p,int proParameter, - int numParameter,int typ, - int schritte= -1,int verbose=1); - - - static double defaultAnfAnnRate; - - static double defaultEndAnnRate; - - static double defaultMultiple; - - -}; -#endif - diff --git a/ext/giza-pp/mkcls-v2/StatVar.cpp b/ext/giza-pp/mkcls-v2/StatVar.cpp deleted file mode 100644 index dbd76cd0..00000000 --- a/ext/giza-pp/mkcls-v2/StatVar.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - -#include "StatVar.h" -#include <math.h> -#include <stdlib.h> - -double compareStatVarQuantil=-1; - -StatV::~StatV() {} - - -int doublecompare(const void *p,const void *j) -{ - if( *(double *)p == *(double *)j) - return 0; - if( *(double *)p- *(double *)j<0 ) - return -1; - else - return 1; -} - -int compareStatVar(const void *p,const void *j) -{ - double a; - double b; - if(compareStatVarQuantil>=0) - { - a=((StatVar *)p)->quantil(compareStatVarQuantil); - b=((StatVar *)j)->quantil(compareStatVarQuantil); - } - else - { - a=((StatVar *)p)->getMean(); - b=((StatVar *)j)->getMean(); - } - if(a==b) - return 0; - if(a<b) - return -1; - else - return 1; -} - -double StatVar::getSigmaSmaller() -{ - double ss=0; - int ns=0; - for(int i=0;i<n;i++) - if( values[i]<getMean() ) - { - ss+=(values[i]-getMean())*(values[i]-getMean()); - ns++; - } - if( ss/ns>0 ) - return sqrt(ss/ns); - else - return 0; -} -double StatVar::getSigmaBigger() -{ - double ss=0; - int ns=0; - for(int i=0;i<n;i++) - if( values[i]>getMean() ) - { - ss+=(values[i]-getMean())*(values[i]-getMean()); - ns++; - } - if( ss/ns>0 ) - return sqrt(ss/ns); - else - return 0; -} - - - -void StatV::dumpOn(ostream &strm) -{ - strm << "MEAN: " << getMean() << " (" << smallest << "-" << biggest - << ") SIGMA:" << getSigma()<< " "; -} - - - -double StatVar::quantil(double percent) -{ - int index=(int)(n*percent); - if(index==n) - index=n-1; - assert(index>=0&&index<n); - if( !sortedFlag ) - { - qsort(&(values[0]),n,sizeof(double),doublecompare); - sortedFlag=1; - } - return values[index]; -} diff --git a/ext/giza-pp/mkcls-v2/StatVar.h b/ext/giza-pp/mkcls-v2/StatVar.h deleted file mode 100644 --- a/ext/giza-pp/mkcls-v2/StatVar.h +++ /dev/null -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef STATVAR -#define STATVAR - -#include <fstream> -#include <iostream> -#include "Array.h" -#include "mystl.h" -#include "myleda.h" -#include <cmath> - - -extern double compareStatVarQuantil; -int compareStatVar(const void *p,const void *j); - -class StatV - -{ - protected: - int n; - double sum; - double squareSum; - double smallest,biggest; - - public: - const char *title; - StatV() : n(0),sum(0),squareSum(0),smallest(1e100),biggest(-1e100),title("") {} - virtual ~StatV(); - - - virtual void addValue(double a) - { - n++; - sum+=a; - squareSum+=a*a; - if(smallest>a) - smallest=a; - if(biggest<a) - biggest=a; - } - - double getMean() {return n ? sum/n : 0;} - double getSigma() - { - if( n==0 ) - return 0; - double e=squareSum/n-getMean()*getMean(); - return e>0 ? sqrt(e) : 0; - } - int getNum() {return n;} - double getSmallest() {return smallest;} - double getBiggest() {return biggest;} - - virtual void dumpOn(ostream &strm); -}; - -class StatVar : public StatV -{ - private: - Array<double> values; - short sortedFlag; - public: - StatVar() - : values(10,0.0,1),sortedFlag(0) {} - virtual ~StatVar(){} - double quantil(double percent=0.5); - - - inline double value(int i) - {return values[i];} - - - void printValues(ostream &strm); - - - virtual void addValue(double a) - { - sortedFlag=0; - values[n]=a; - StatV::addValue(a); - } - - double getSigmaSmaller(); - - - double getSigmaBigger(); - - -}; - - -#endif diff --git a/ext/giza-pp/mkcls-v2/TAOptimization.cpp b/ext/giza-pp/mkcls-v2/TAOptimization.cpp deleted file mode 100644 index 074ff62b..00000000 --- a/ext/giza-pp/mkcls-v2/TAOptimization.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA.
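StatVar keeps every sample so it can report order statistics, not just running moments: quantil() indexes into the sorted values, and getSigmaSmaller()/getSigmaBigger() are one-sided deviations computed only from samples below/above the mean, which is what the asymmetric run summaries printed by optimizeValue() use. A compact standalone illustration of the same quantities (MiniStat is hypothetical; the real class sits on mkcls's Array type):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct MiniStat {
  std::vector<double> values;
  void addValue(double v) { values.push_back(v); }
  double mean() const {
    double s = 0;
    for (double v : values) s += v;
    return values.empty() ? 0 : s / values.size();
  }
  double quantil(double percent) const {        // sorted-order quantile
    std::vector<double> sorted(values);
    std::sort(sorted.begin(), sorted.end());
    std::size_t idx = (std::size_t)(sorted.size() * percent);
    if (idx == sorted.size()) idx--;            // same clamp as StatVar::quantil
    return sorted[idx];
  }
  double sigmaSmaller() const {                 // sigma over values below the mean
    double m = mean(), ss = 0; int ns = 0;
    for (double v : values)
      if (v < m) { ss += (v - m) * (v - m); ns++; }
    return ns ? std::sqrt(ss / ns) : 0;
  }
};

int main() {
  MiniStat s;
  for (double v : {1.0, 2.0, 2.5, 3.0, 10.0}) s.addValue(v);
  std::printf("mean=%.2f median=%.2f sigma-=%.2f\n",
              s.mean(), s.quantil(0.5), s.sigmaSmaller());
}
```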
- -*/ - - - - -#include "TAOptimization.h" -#include "ProblemTest.h" - - -double TAOptimization::defaultAnnRate=0.4; -double TAOptimization::defaultMultiple=2.0; - - -TAOptimization::TAOptimization(Problem &p,double t,double d,int m) -: IterOptimization(p,m) , temperatur(t) , deltaTemperatur(d) -{ - assert(t>0 && d>0); -} - - - -TAOptimization::TAOptimization(Problem&p,int m) -: IterOptimization(p,m), temperatur(-1) -{ -} - - - -TAOptimization::TAOptimization(TAOptimization &o) -: IterOptimization(o) -{ - temperatur= o.temperatur; - deltaTemperatur= o.deltaTemperatur; -} - - - - -void TAOptimization::zInitialize() -{ - IterOptimization::zInitialize(); - if( temperatur<0) - { - - - int n; - - StatVar &v=problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN); - - if(maxStep>0) - n=(int)(maxStep*4.0/5.0); - else - maxStep=n=(int)(problem.expectedNumberOfIterations()*defaultMultiple); - - temperatur = v.quantil(defaultAnnRate); - deltaTemperatur = temperatur/n; - - if( verboseMode>0 ) - cout << "#TA: (anfAnnRate=" - << defaultAnnRate << ",T=" << temperatur << ",deltaT=" - << deltaTemperatur << ")\n"; - curStep=0; - endFlag=0; - delete &v; - } -} - - -short TAOptimization::end() -{ - - - if( temperatur>0 ) - { - endFlag=0; - bestStep=curStep; - } - return endFlag>0; -} - -short TAOptimization::accept(double delta) -{ - if( temperatur<0 ) - return 1; - else - if( delta < temperatur ) - return 1; - else - return 0; -} - -void TAOptimization::abkuehlen() -{ - if( temperatur>=0 ) - temperatur=(temperatur-deltaTemperatur>0)?(temperatur-deltaTemperatur):0; -} - -void TAOptimization::makeGraphOutput() -{ - IterOptimization::makeGraphOutput(); - *GraphOutput << temperatur; -} - - - - -double TAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ, - int optimierungsschritte,int print) -{ - switch(typ) - { - case 1: - { - double bestPar=-1,best=1e100; - if(print)cout << "#TA-optimizeValues: " << numParameter << endl; - for(int i=0;i<=numParameter;i++) - { - StatVar end,laufzeit,init; - double now; - defaultAnnRate = (float)(i)/numParameter; - solveProblem(0,p,proParameter,optimierungsschritte,TA_OPT,now,end, - laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultAnnRate; - } - if( print) - { - cout << defaultAnnRate << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit Bester" - " Sigma SigmaSmaller SigmaBigger\n"; - defaultAnnRate=0.5; - return bestPar; - } - break; - case 10: - { - double bestPar=-1,best=1e100; - if( print ) - cout << "#TA-optimizeValues: defaultMultiple " << 10 << endl; - for(int i=1;i<=6;i++) - { - StatVar end,laufzeit,init; - double now; - defaultMultiple = i; - solveProblem(0,p,proParameter,optimierungsschritte,TA_OPT,now, - end,laufzeit,init); - if( best>now ) - { - best=now; - bestPar=defaultMultiple; - } - if( print ) - { - cout << defaultMultiple << " "; - cout << end.getMean() << " " << end.quantil(0.2) << " " - << end.quantil(0.79) << " " << laufzeit.getMean() << " " - << end.quantil(0.0) << " " << end.getSigma() << " " - << end.getSigmaSmaller() << " " << end.getSigmaBigger() - << " " << now << endl; - } - } - if( print ) - cout << "#Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit Bester Sigma " - " SigmaSmaller SigmaBigger\n"; - defaultMultiple=2.0; - 
return bestPar; - } - break; - default: - cerr << "Error: wrong parameter-type in TAOptimization::optimizeValue (" - << typ << ")\n"; - exit(1); - } - return 1e100; -} - - diff --git a/ext/giza-pp/mkcls-v2/TAOptimization.h b/ext/giza-pp/mkcls-v2/TAOptimization.h deleted file mode 100644 index 33823068..00000000 --- a/ext/giza-pp/mkcls-v2/TAOptimization.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - - - - - -#ifndef TAOPTIMIZATION -#define TAOPTIMIZATION - -#include "IterOptimization.h" - -class TAOptimization : public IterOptimization { - - - private: - double temperatur; - double deltaTemperatur; - - protected: - virtual void zInitialize(); - - - virtual short accept(double delta); - - - virtual void abkuehlen(); - - - virtual short end(); - - - virtual void makeGraphOutput(); - - - public: - TAOptimization(Problem &p,double temperatur, - double deltaTemperatur,int maxIter=-1); - - - TAOptimization(Problem &p,int maxIter=-1); - - - TAOptimization(TAOptimization &o); - - - static double optimizeValue(Problem &p,int proParameter, - int numParameter,int typ,int schritte= -1,int verbose=1); - - - static double defaultAnnRate; - - static double defaultMultiple; - -}; -#endif diff --git a/ext/giza-pp/mkcls-v2/general.cpp b/ext/giza-pp/mkcls-v2/general.cpp deleted file mode 100644 index ddd5fe48..00000000 --- a/ext/giza-pp/mkcls-v2/general.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
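Threshold accepting — mkcls's default strategy (TA_OPT) — differs from simulated annealing only in the acceptance test: TAOptimization::accept() above deterministically takes any move whose worsening is below the current threshold (`temperatur`), and abkuehlen() lowers that threshold linearly to zero, at which point only strict improvements survive. A minimal sketch of the rule in isolation (ThresholdAccepting is a hypothetical name):

```cpp
#include <cstdio>

// Deterministic threshold accepting, mirroring TAOptimization's
// accept()/abkuehlen(): accept iff delta < threshold; cool linearly.
struct ThresholdAccepting {
  double temperatur;         // current threshold
  double deltaTemperatur;    // linear cooling step
  bool accept(double delta) const { return delta < temperatur; }
  void abkuehlen() {         // clamp at zero, as in the original
    temperatur = temperatur > deltaTemperatur ? temperatur - deltaTemperatur : 0.0;
  }
};

int main() {
  ThresholdAccepting ta{1.0, 0.25};
  for (double delta : {0.5, 0.5, 0.5, 0.5, 0.5}) {
    std::printf("T=%.2f delta=%.2f -> %s\n", ta.temperatur, delta,
                ta.accept(delta) ? "accept" : "reject");
    ta.abkuehlen();
  }
}
```

As in the RRT and SA variants, zInitialize() above calibrates the starting threshold from the problem itself, taking the defaultAnnRate-quantile of sampled worsening deltas rather than a fixed constant.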
- -*/ - - - - - -#include -#include - - -extern "C" { -#include -#include - - -} - -#include "general.h" - -extern "C" { -#ifndef __linux__ -int getrusage(int who, struct rusage *rusage); -#endif -}; -int verboseMode=0; - -#ifdef aNeXT -#define NO_TEMPLATES -#endif - - -void myerror(int line,const char *file,const char *expression) -{ - cerr << "(general.h):Assertion failed: '" << expression << "' ::: b " - << file << ":" << line << endl; -} - - -void imyerror(int line,const char *file,const char *expression) -{ - cerr << "Error: '" << expression << "' ::: in Source " << file - << ":" << line << endl; - #ifndef DEBUG - - #endif -} - - - -void zufallSeed(int z) -{ -#ifdef NeXT - srandom(z); -#else - srand48(z); -#endif -} - - - -double zufall01() -{ -#ifdef NeXT - return (double)(random()%65536)/65536.0; -#else - return drand48(); -#endif -} - - - -double zufall(double min,double max) -{ - double z=zufall01()*(max-min)+min; - assert(z>=min&&z=0); - assert(i -#ifdef NeXT -#include -#endif -#include - - - -#define NULLFLOAT(x) ( fabs(x)<=0.0000001 ) -#define EQUALFLOAT(x,y) ( fabs(x-y)<(fabs(x)+fabs(y))/10000000.0 ) - - - - -#define TEST_RANDOM_SEED 532567487 - -double zufall01(); - - -double zufall(double min,double max); - - -int randomInt(int exclusive); - - -void zufallSeed(int z =TEST_RANDOM_SEED); - - - - -#include "myassert.h" -#include -#include "Array.h" - - - - - - -double clockSec(); - -extern int verboseMode; - - - -inline string operator&(const string&a,const string&b) -{ - string c(a); - c+=b; - return c; -} - - - -#endif - diff --git a/ext/giza-pp/mkcls-v2/makePackage.sh b/ext/giza-pp/mkcls-v2/makePackage.sh deleted file mode 100644 index 2790e61a..00000000 --- a/ext/giza-pp/mkcls-v2/makePackage.sh +++ /dev/null @@ -1,43 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ -#! 
/bin/csh - -setenv VERSION `date +%Y-%m-%d` -rm -rf mkcls-v2 - -mkdir mkcls-v2 -foreach i ( Array.h FixedArray.h FlexArray.h GDAOptimization.C GDAOptimization.h HCOptimization.C HCOptimization.h IterOptimization.C IterOptimization.h KategProblem.C KategProblem.h KategProblemKBC.C KategProblemKBC.h KategProblemTest.C KategProblemTest.h KategProblemWBC.C KategProblemWBC.h MSBOptimization.C MSBOptimization.h MYOptimization.C MYOptimization.h Optimization.C Optimization.h PopOptimization.C PopOptimization.h Problem.C Problem.h ProblemTest.C ProblemTest.h RRTOptimization.C RRTOptimization.h SAOptimization.C SAOptimization.h StatVar.C StatVar.h TAOptimization.C TAOptimization.h general.C general.h makePackage.sh mkcls.C my.h myassert.h myleda.h mystl.h ) - cat $i | filterIfdef.out NO_LIGHT_GIZA | filterIfdefInverse.out DEBUG | filterIfdefInverse.out DEBUG_TRICKY_IBM3 | filterIfdefInverse.out VDEBUG | stripcmt | addHead.out -file header > mkcls-v2/$i -end - -cp Makefile.simple mkcls-v2/Makefile -cp ../giza++/GNU.GPL mkcls-v2 -cp ../giza++/LICENSE mkcls-v2 -cp README mkcls-v2 - -tar cf - mkcls-v2 | gzip -9 > mkcls.$VERSION.tar.gz - -cd mkcls-v2 -gmake -k -cd .. - diff --git a/ext/giza-pp/mkcls-v2/mkcls.cpp b/ext/giza-pp/mkcls-v2/mkcls.cpp deleted file mode 100644 index 90ebfde7..00000000 --- a/ext/giza-pp/mkcls-v2/mkcls.cpp +++ /dev/null @@ -1,618 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
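mkcls options are single letters with the value glued directly on (-c80, -n10, -pin, -Vout), parsed in main() below by switching on argv[1][1] and running sscanf on the remainder. A trimmed-down sketch of that convention (the variables here are hypothetical stand-ins for the statics in mkcls.cpp):

```cpp
#include <cstdio>
#include <cstring>

int main(int argc, char **argv) {
  int classes = 100, runs = 1;
  char corpus[1024] = "train";
  while (argc > 1 && argv[1][0] == '-') {
    switch (argv[1][1]) {
      case 'c': std::sscanf(argv[1] + 2, "%d", &classes); break;        // -c80
      case 'n': std::sscanf(argv[1] + 2, "%d", &runs);    break;        // -n10
      case 'p': std::strncpy(corpus, argv[1] + 2, sizeof(corpus) - 1);  // -pin
                break;
      default:  std::fprintf(stderr, "unknown option %s\n", argv[1]);
                return 1;
    }
    argv++; argc--;   // consume one flag per loop, as main() below does
  }
  std::printf("%d classes, %d runs, corpus '%s'\n", classes, runs, corpus);
}
```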
- -*/ - - - - -#include <stdio.h> -#include <iostream> -#include <string.h> -#include <stdlib.h> -#include "general.h" - -#include "KategProblem.h" -#include "KategProblemTest.h" - -#include "ProblemTest.h" -#include "TAOptimization.h" -#include "GDAOptimization.h" -#include "RRTOptimization.h" -#include "SAOptimization.h" -#include "HCOptimization.h" - - -double SigmaVerfaelschung=5.0; -int OneWithHapas=1; -char *hapaxInitName=0; - - - - - -static int nLaeufe=1,nLaeufeReduce=3; - - -static int optimizeParameterAnzahl=10; - - -static int IterOptVerf=TA_OPT; - - -static int MaxIterOptSteps= -1; - - -static int MaxSecs=0; - - - - - -static int InitValue=INIT_RAN; - - -static int Criterion=CRITERION_ML; - - -static int Wwahl=W_DET_DECR; - - -static int Kwahl=K_BEST; - - -static int NumberCategories=100; - - -static int MinWordFrequency=0; - - -static int IterOptSet=0; - - -static KategProblem *p = 0; - - -char korpusName[1024]="train"; -int korpusIsText=1; - - -char *FileForOther=0; - -void printUsage(int r) -{ - cout << - "mkcls - a program for making word classes: Usage: \n" - " mkcls [-nnum] [-ptrain] [-Vfile] opt\n" - - - - - - - "-V output classes (Default: no file)\n" - - - "-n number of optimization runs (Default: 1); larger number => better results\n" - - "-p filename of training corpus (Default: 'train')\n" - - - - - - - - - "Example:\n" - " mkcls -c80 -n10 -pin -Vout opt\n" - " (generates 80 classes for the corpus 'in' and writes the classes in 'out')\n" - "Literature: \n" - " Franz Josef Och: 'Maximum-Likelihood-Schätzung von Wortkategorien mit Verfahren\n" - " der kombinatorischen Optimierung' Studienarbeit, Universität Erlangen-Nürnberg,\n" - " Germany,1995. \n"; - exit(r); -} - - - - - - - -void makeIterOpt() -{ - double maxTime=clockSec()+MaxSecs; - if(MaxSecs==0)maxTime=0; - double mean; - StatVar end,laufzeit,init; - solveProblem(1+(PrintBestTo!=0),*p,nLaeufe,MaxIterOptSteps,IterOptVerf, - mean,end,laufzeit,init,maxTime); - if( verboseMode>1 ) - p->dumpOn(cout); -} - - - -void makeIzrOpt() -{ - double maxTime=clockSec()+MaxSecs; - if(MaxSecs==0)maxTime=0; - izrOptimization(*p,nLaeufeReduce,nLaeufeReduce,0,maxTime,IterOptVerf); -} - - - -int makeMetaOpt(int argc,char **argv) -{ - int ret=0; - - if(argc==4 || argc==3) - { - int typ=0; - if( argc==4 ) - { - sscanf(argv[3],"%d",&typ); - assert(typ>0 && typ<=11 ); - } - if( isdigit(argv[2][0]) ) - { - int a; - sscanf(argv[2],"%d",&a); - switch(a) - { - case 1: - SAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,1); - break; - case 2: - SAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,2); - break; - case 3: - SAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,10); - break; - case 4: - TAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,1); - break; - case 5: - TAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,10); - break; - case 6: - RRTOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,1); - break; - case 7: - RRTOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,10); - break; - case 8: - GDAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,1); - break; - default: - cerr << "Error: Wrong parameter number (" << argv[2] - << ").\n"; - printUsage(1); - } - } - else - { - if(strcasecmp(argv[2],"gda")==0) - { - GDAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,typ); - } - else if(strcasecmp(argv[2],"ta")==0) - { - TAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,typ); - } - else 
if(strcasecmp(argv[2],"rrt")==0) - { - RRTOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,typ); - } - else if(strcasecmp(argv[2],"sa")==0) - { - SAOptimization::optimizeValue(*p,nLaeufe, - optimizeParameterAnzahl,typ); - } - - - - - else - { - cerr << "Error: unknown algorithm " << argv[2] << endl; - printUsage(1); - } - } - } - else - { - cerr << "Error: wrong number of arguments: " << argc << endl; - printUsage(1); - } - return ret; -} - - - - - - - - - - -void setVerfahren(char *p) -{ - if(strcasecmp(p,"rrt")==0 ) - IterOptVerf=RRT_OPT; - else if(strcasecmp(p,"ta")==0) - IterOptVerf=TA_OPT; - else if(strcasecmp(p,"gda")==0) - IterOptVerf=GDA_OPT; - else if(strcasecmp(p,"sa")==0) - IterOptVerf=SA_OPT; - else if(strcasecmp(p,"hc")==0) - IterOptVerf=HC_OPT; - else - { - cerr << "Error: Unknown iterative-optimization algorithm '" << p << "'.\n"; - printUsage(1); - } -} - - - -void setInitValue(char *iv,char *fileForOther) -{ - if(strcasecmp(iv,"ran")==0 ) - InitValue=INIT_RAN; - else if(strcasecmp(iv,"aio")==0) - InitValue=INIT_AIO; - else if(strcasecmp(iv,"gda")==0) - InitValue=INIT_LWRW; - else if(strcasecmp(iv,"freq")==0) - InitValue=INIT_FREQ; - else if(strcasecmp(iv,"other")==0) - { - InitValue=INIT_OTHER; - FileForOther=strdup(fileForOther); - } - else - { - cerr << "Error: Unknown initialization '" << iv << "'.\n"; - printUsage(1); - } -} - - -void setWwahl(const char *ww) -{ - if(strcasecmp(ww,"ran")==0 ) - Wwahl=W_RAN; - else if(strcasecmp(ww,"det")==0) - Wwahl=W_DET_DECR; - else if(strcasecmp(ww,"incr")==0) - Wwahl=W_DET_INCR; - else - { - cerr << "Error: Unknown word-selection '" << ww << "'.\n"; - printUsage(1); - } -} - - -void setKwahl(const char *kw) -{ - if( strcasecmp(kw,"det")==0 ) - Kwahl=K_DET; - else if(strcasecmp(kw,"ran")==0 ) - Kwahl=K_RAN; - else if(strcasecmp(kw,"best")==0) - Kwahl=K_BEST; - else - { - cerr << "Error: Unknown category-selection '" << kw << "'.\n"; - printUsage(1); - } -} - - -void setParameter(const char *nr1,const char *nr2) -{ - int n1; - float n2; - sscanf(nr1,"%d",&n1); - sscanf(nr2,"%f",&n2); - IterOptSet=1; - switch(n1) - { - case 1: - SAOptimization::defaultAnfAnnRate=n2; - if(verboseMode)cout << "Parameter gamma_0 (SA) set to " - << SAOptimization::defaultAnfAnnRate << endl; - iassert(0<=SAOptimization::defaultAnfAnnRate&& - SAOptimization::defaultAnfAnnRate<=1); - break; - case 2: - SAOptimization::defaultEndAnnRate=n2; - if(verboseMode)cout << "Parameter gamma_e (SA) set to " - << SAOptimization::defaultEndAnnRate << endl; - iassert(0<=SAOptimization::defaultEndAnnRate - &&SAOptimization::defaultEndAnnRate<=1); - break; - case 3: - SAOptimization::defaultMultiple=n2; - if(verboseMode)cout << "Parameter nu_e (SA) set to " - << SAOptimization::defaultMultiple << endl; - iassert( SAOptimization::defaultMultiple>0 ); - break; - case 4: - TAOptimization::defaultAnnRate=n2; - if(verboseMode)cout << "Parameter gamma_{TA} set to " - << TAOptimization::defaultAnnRate << endl; - iassert(0<=TAOptimization::defaultAnnRate - &&TAOptimization::defaultAnnRate<=1); - break; - case 5: - TAOptimization::defaultMultiple=n2; - if(verboseMode)cout << "Parameter nu_{TA} set to " - << TAOptimization::defaultMultiple << endl; - iassert( TAOptimization::defaultMultiple>0 ); - break; - case 6: - RRTOptimization::defaultAnnRate=n2; - if(verboseMode)cout << "Parameter gamma_{RRT} set to " - << RRTOptimization::defaultAnnRate << endl; - iassert(0<=RRTOptimization::defaultAnnRate - && RRTOptimization::defaultAnnRate<=1); - break; - case 7: - 
RRTOptimization::defaultMultiple=n2; - if(verboseMode)cout << "Parameter nu_{RRT} set to " - << RRTOptimization::defaultMultiple << endl; - iassert( RRTOptimization::defaultMultiple>0 ); - break; - case 8: - GDAOptimization::defaultAlpha=n2; - if(verboseMode)cout << "Parameter alpha set to " - << GDAOptimization::defaultAlpha << endl; - iassert(0<=GDAOptimization::defaultAlpha - && GDAOptimization::defaultAlpha<1 ); - break; - default: - cerr << "Error: Wrong parameter number " << nr1 << " " << n1 << endl; - printUsage(1); - } -} - - - -void setKorpusName(const char *s) -{ - strcpy(korpusName,s); -} - -void setHapaxInitName(const char *s) -{ - hapaxInitName=strdup(s); -} - -void setKorpus() -{ - if( korpusIsText ) - { - if( (p=fromKModel(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, - MinWordFrequency))==0) - { - cerr << "Error: Could not read the file '" << korpusName << "'.\n"; - printUsage(1); - } - } - else - { - if( (p=fromNgrFile(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, - MinWordFrequency))==0) - { - cerr << "Error: Could not read the file '" << korpusName << "'.\n"; - printUsage(1); - } - p->wordFreq.initializeIndex(*(p->words),'1',2,1+NumberCategories/2,!OneWithHapas); - p->wordFreq.initializeIndex(*(p->words),'2',2+NumberCategories/2,1+NumberCategories,OneWithHapas); - } - if( IterOptSet==0 ) - KategProblemSetParameters(*p); -} - - - - - - -int main(int argc,char **argv) -{ - double startTime=clockSec(); - zufallSeed(); - while( argc>1 && argv[1][0]=='-' ) - { - - switch(argv[1][1]) - { - case 'v': - sscanf(argv[1]+2,"%d",&verboseMode); - iassert(verboseMode>=0); - break; - case 'O': - sscanf(argv[1]+2,"%d",&OneWithHapas); - cout << "OneWithHapas: " << OneWithHapas << endl; - break; - case 'n': - sscanf(argv[1]+2,"%d",&nLaeufe); - nLaeufeReduce=nLaeufe; - iassert( nLaeufe>=1 ); - break; - case 'l': - Criterion=1; - if( argv[1][2] ) - { - sscanf(argv[1]+2,"%lf",&rhoLo); - if( verboseMode ) - cout << "Parameter rho (for LO) set to " << rhoLo << ".\n"; - iassert(0<=rhoLo && rhoLo<=1); - } - if( verboseMode ) - cout << "Criterion LO used.\n"; - break; - case 'y': - Criterion=2; - if( argv[1][2] ) - { - sscanf(argv[1]+2,"%lf",&SigmaVerfaelschung); - if( verboseMode ) - cout << "Parameter SigmaVerfaelschung set to " << SigmaVerfaelschung << ".\n"; - iassert(0<SigmaVerfaelschung); - } - break; - case 'p': - setKorpusName(argv[1]+2); - break; - case 'i': - setInitValue(argv[1]+2,argv[2]); - argv++,argc--; - break; - case 'w': - setWwahl(argv[1]+2); - break; - case 'k': - setKwahl(argv[1]+2); - break; - case 'c': - sscanf(argv[1]+2,"%d",&NumberCategories); - iassert(NumberCategories>=2); - break; - case 'm': - sscanf(argv[1]+2,"%d",&MinWordFrequency); - break; - case 'e': - setParameter(argv[1]+2,argv[2]); - argv++,argc--; - break; - case 'a': - setVerfahren(argv[1]+2); - break; - case 'r': - { - int s; - sscanf(argv[1]+2,"%d",&s); - zufallSeed(s); - } - break; - case 'V': - if(argv[1][2]) - { - char str[1024]; - strcpy(str,argv[1]+2); - PrintBestTo=new ofstream(str); - strcat(str,".cats"); - PrintBestTo2=new ofstream(str); - } - else - cout << "Output to cout\n"; - break; - case 'M': - sscanf(argv[1]+2,"%d",&MaxIterOptSteps); - break; - case 's': - sscanf(argv[1]+2,"%d",&MaxSecs); - break; - case 'N': - sscanf(argv[1]+2,"%d",&optimizeParameterAnzahl); - break; - case 'o': - GraphOutput = new ofstream(argv[1]+2); - if( GraphOutput==0 ) - cerr << "Warning: Open failed for file '" << argv[1]+2 << "'.\n"; - break; - default: - cerr << "Invalid option: " << argv[1] << endl; - printUsage(1); - } - argv++; - argc--; - } - - - setKorpus(); - if( FileForOther ) - { - fromCatFile(p,FileForOther); - p->initialisierung=InitValue; - p->_initialize(InitValue); - } - - if( hapaxInitName ) - { - fromCatFile(p,hapaxInitName,0); - p->fixInitLike(); - } - - double start2Time=clockSec(); - - 
if(argc>=2 && strcasecmp(argv[1],"opt")==0 ) - makeIterOpt(); - else if(argc>=2 && strcasecmp(argv[1],"meta-opt")==0) - makeMetaOpt(argc,argv); - else if(argc>=2 && strcasecmp(argv[1],"izr-opt")==0) - makeIzrOpt(); - - - else - { - makeIterOpt(); - } - - if( verboseMode ) - { - cout << " full-time: " << clockSec()-startTime << endl; - cout << "optimize-time: " << clockSec()-start2Time << endl; - } - return 0; -} - diff --git a/ext/giza-pp/mkcls-v2/my.h b/ext/giza-pp/mkcls-v2/my.h deleted file mode 100644 index ba06657d..00000000 --- a/ext/giza-pp/mkcls-v2/my.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - -#ifndef HEADER_my_DEFINED -#define HEADER_my_DEFINED - -#define over_array(a,i) for(i=(a).low();i<=(a).high();i++) -#define backwards_array(a,i) for(i=(a).high();i>=(a).low();i--) -#define over_arr(a,i) for(int i=(a).low();i<=(a).high();i++) -#define over_arrMAX(a,i,max) for(int i=(a).low();i<=min((a).high(),max-1);i++) -#define backwards_arr(a,i) for(int i=(a).high();i>=(a).low();i--) - -extern double n1mult,n2mult,n3mult; - -inline double realProb(int n1,int n2) -{ - massert(n1<=n2); - iassert(n1>=0&&n2>0); - if(n2==0)n2=1; - return ((double)n1)/(double)n2; -} - -inline double verfProb(int n1,int n2) -{ - double prob = realProb(n1,n2); - if( n1==1 )return prob*n1mult; - else if( n1==2 )return prob*n2mult; - else if( n1==3 )return prob*n3mult; - else return prob; -} - -#endif diff --git a/ext/giza-pp/mkcls-v2/myassert.h b/ext/giza-pp/mkcls-v2/myassert.h deleted file mode 100644 index da86ffb5..00000000 --- a/ext/giza-pp/mkcls-v2/myassert.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
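The realProb/verfProb helpers in my.h above damp relative frequencies built from very small counts: realProb(n1,n2) is plain n1/n2, while verfProb() scales events seen exactly once, twice, or three times by the global factors n1mult/n2mult/n3mult. A standalone sketch with illustrative values for those globals (the real values are set elsewhere in mkcls, so the numbers below are assumptions):

```cpp
#include <cstdio>

// Illustrative damping factors; in mkcls these globals live elsewhere.
double n1mult = 0.5, n2mult = 0.8, n3mult = 0.9;

double realProb(int n1, int n2) { return n2 ? (double)n1 / n2 : 0.0; }

// Same shape as my.h's verfProb(): distrust tiny counts progressively less.
double verfProb(int n1, int n2) {
  double prob = realProb(n1, n2);
  if (n1 == 1) return prob * n1mult;   // hapax counts are trusted least
  if (n1 == 2) return prob * n2mult;
  if (n1 == 3) return prob * n3mult;
  return prob;                         // counts >= 4 taken at face value
}

int main() {
  std::printf("raw=%.4f damped=%.4f\n", realProb(1, 1000), verfProb(1, 1000));
}
```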
- -*/ - - - -#ifndef MY_ASSERT_DEFINED -#define MY_ASSERT_DEFINED -void myerror(int line,const char *file,const char *expression); -void imyerror(int line,const char *file,const char *expression); - -#define iassert(expression) do {if (!(expression)) {imyerror(__LINE__,__FILE__,#expression);}} while (0) - -#define massert(expr) do {} while(0) - -#define vassert(expr) do {} while(0) - -#include <assert.h> - -#endif - - - - - diff --git a/ext/giza-pp/mkcls-v2/myleda.h b/ext/giza-pp/mkcls-v2/myleda.h deleted file mode 100644 index adf3845f..00000000 --- a/ext/giza-pp/mkcls-v2/myleda.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. - -*/ - - - -#ifndef myleda_HEADER_defined -#define myleda_HEADER_defined -#include <map> -#include <set> -#include <unordered_map> -#include "myassert.h" -#include "FixedArray.h" -using namespace std; - -template <class T> -class leda_array : public FixedArray<T> -{ -public: - leda_array() {} - leda_array(int n) : FixedArray<T>(n) {} -}; - -template <class T> -class leda_set : public set<T> -{ -public: - bool member(const T&m) const - { return this->count(m)!=0; } - void del(const T&m) - { this->erase(m); } -}; -#define forall_set(a,b,c) for(a::iterator __i__=c.begin();__i__!=c.end()&&((b=*__i__),1);++__i__) -template <class T> -leda_set<T> operator&(const leda_set<T>&a,const leda_set<T>&b) -{ - leda_set<T> c; - insert_iterator<set<T> > iter(c,c.begin()); - set_intersection(a.begin(),a.end(),b.begin(),b.end(),iter); - return c; -} -template <class T> -leda_set<T> operator-(const leda_set<T>&a,const leda_set<T>&b) -{ - leda_set<T> c; - insert_iterator<set<T> > iter(c,c.begin()); - set_difference(a.begin(),a.end(),b.begin(),b.end(),iter); - return c; -} - -template <class A,class B> -class leda_d_array : public map<A,B> -{ -private: - B init; -public: - bool defined(const A&a) const - { return this->find(a)!=this->end(); } - const B&operator[](const A&a)const - { - typename map<A,B>::const_iterator pos=this->find(a); - iassert(pos!=this->end()); - if( pos==this->end() ) - return init; - else - return pos->second; - } - B&operator[](const A&a) - { - typename map<A,B>::iterator pos=this->find(a); - if( pos==this->end() ) - { - this->insert(typename map<A,B>::value_type(a,init)); - pos=this->find(a); - iassert(pos!=this->end()); - } - return pos->second; - } -}; - -#define forall_defined_d(a,b,c,d) for(typename leda_d_array<a,b>::const_iterator __ii__=(d).begin();__ii__!=(d).end()&&((c=__ii__->first),1) ;++__ii__) -#define forall_d(a,b,c,d) for(typename leda_d_array<a,b>::const_iterator __ii__=(d).begin();__ii__!=(d).end()&&((c=__ii__->second),1);++__ii__) - -double used_time(); - -template <class T> -class my_hash -{ -public: - int operator()(const T&t)const {return Hash(t);} -}; - -inline int Hash(int value) { return value; } -#define MY_HASH_BASE std::unordered_map<A,B> - -template <class A,class B> -class leda_h_array : public MY_HASH_BASE -{ -private: - B init; -public: - leda_h_array() {} - leda_h_array(const B&_init) - : 
MY_HASH_BASE(),init(_init) {} - bool defined(const A&a) const - { return find(a)!=this->end(); } - const B&operator[](const A&a)const - { - typename MY_HASH_BASE::const_iterator pos=this->find(a); - - if( pos==this->end() ) - return init; - else - return pos->second; - } - B&operator[](const A&a) - { - typename MY_HASH_BASE::iterator pos=this->find(a); - if( pos==this->end() ) - { - this->insert(typename MY_HASH_BASE::value_type(a,init)); - pos=this->find(a); - iassert(pos!=this->end()); - } - return pos->second; - } -}; - -#define forall_defined_h(a,b,c,d) for(typename leda_h_array::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__) -#define forall_defined_h2(a,b,c,d) for(leda_h_array::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__) -#define forall_h(a,b,c,d) for(typename leda_h_array::const_iterator __jjj__=(d).begin();__jjj__!=(d).end()&&((c=__jjj__->second),1);++__jjj__) - - -template int compare(const T&a,const T&b) -{if(a==b)return 0; else if(a -ostream & operator<<(ostream&out,const leda_h_array&w) -{ - T t; - bool makeNl=0; - out << "h_array{"; - forall_defined_h(T,U,t,w) - { - if( makeNl ) - out << "\n "; - out << "EL:" << t << " INH:" << w[t] << "."; - makeNl=1; - } - return out << "}\n"; -} -template -ostream & operator<<(ostream&out,const leda_d_array&w) -{ - T t; - bool makeNl=0; - out << "h_array{"; - forall_defined_h(T,U,t,w) - { - if( makeNl ) - out << "\n "; - out << "EL:" << t << " INH:" << w[t] << "."; - makeNl=1; - } - return out << "}\n"; -} - -template -ostream&printSet(ostream&out,const leda_set&s) -{ - bool first=1; - T t; - out << "{"; - forall_set(typename set,t,s) - { - if( first==0 ) - out << ", "; - out << t; - first=0; - } - return out << "}\n"; -} - -template -istream & operator>>(istream&in,leda_h_array&) -{ - return in; -} - -template -bool operator==(const leda_h_array&p1,const leda_h_array&p2) -{ - A v; - forall_defined_h(A,B,v,p1) - if( !( p1[v]==p2[v]) ) return 0; - forall_defined_h(A,B,v,p2) - if( !( p1[v]==p2[v]) ) return 0; - return 1; -} -template -bool operator==(const leda_d_array&p1,const leda_d_array&p2) -{ - A v; - forall_defined_d(A,B,v,p1) - if( !( p1[v]==p2[v]) ) return 0; - forall_defined_d(A,B,v,p2) - if( !( p1[v]==p2[v]) ) return 0; - return 1; -} - - - -#endif diff --git a/ext/giza-pp/mkcls-v2/mystl.h b/ext/giza-pp/mkcls-v2/mystl.h deleted file mode 100644 index 99f79654..00000000 --- a/ext/giza-pp/mkcls-v2/mystl.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - -Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och - -mkcls - a program for making word classes . - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, -USA. 
- -*/ - - - -#ifndef MY_STL_H_DEFINED -#define MY_STL_H_DEFINED -#include -#include -#include -#include - -using namespace std; - -namespace std { - template - struct hash > { - static inline void hash_combine(std::size_t & seed, const T & v) { - hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - - size_t operator()(const std::pair& x) const { - size_t h = 0; - hash_combine(h, x.first); - hash_combine(h, x.second); - return h; - } - }; -} - -#define over_string(a,i) for(unsigned int i=0;i -istream& operator>>(istream &in,pair &ir) -{ - char c; - do in.get(c); while (in && isspace(c)); - if (!in) return in; - if (c != '(') in.putback(c); - in >> ir.first; - do in.get(c); while (isspace(c)); - if (c != ',') in.putback(c); - in >> ir.second; - do in.get(c); while (c == ' '); - if (c != ')') in.putback(c); - return in; -} - -template -ostream& operator<<(ostream &out,const pair &ir) -{ - out << "(" << ir.first << "," << ir.second << ")"; - return out; -} - -void printSpaces(ostream&out,int n); -void mysplit(const string &s,string &s1,string &s2); -string untilChar(const string&s,char c); - -template -class tri -{ -public: - A a; - B b; - C c; - tri(){}; - tri(const A&_a,const B&_b,const C&_c) - : a(_a),b(_b),c(_c) {} -}; -template -bool operator==(const tri&x,const tri&y) -{ return x.a==y.a&&x.b==y.b&&x.c==y.c;} - -template -bool operator<(const tri&x,const tri&y) -{ - if(x.a -#include -#include -#include - -#include "cmd.h" - -#ifdef WIN32 -# define popen _popen -# define pclose _pclose -#endif - -static Enum_T BoolEnum[] = { - { "FALSE", 0 }, - { "TRUE", 1 }, - { 0, 0 } -}; - -#ifdef NEEDSTRDUP -char *strdup(); -#endif - -#define FALSE 0 -#define TRUE 1 - -#define LINSIZ 10240 -#define MAXPARAM 256 - -static char *GetLine(), - **str2array(); -static int Scan(), - SetParam(), - SetEnum(), - SetSubrange(), - SetStrArray(), - SetGte(), - SetLte(), - CmdError(), - EnumError(), - SubrangeError(), - GteError(), - LteError(), - PrintParam(), - PrintEnum(), - PrintStrArray(); - -static Cmd_T cmds[MAXPARAM+1]; -static char *SepString = " \t\n"; - -#if defined(__STDC__) -#include -int DeclareParams(char *ParName, ...) -#else -#include -int DeclareParams(ParName, va_alist) -char *ParName; -va_dcl -#endif -{ - va_list args; - static int ParamN = 0; - int j, - c; - char *s; - -#if defined(__STDC__) - va_start(args, ParName); -#else - va_start(args); -#endif - for(;ParName;) { - if(ParamN==MAXPARAM) { - fprintf(stderr, "Too many parameters !!\n"); - break; - } - for(j=0,c=1; jj; c--) { - cmds[c] = cmds[c-1]; - } - cmds[j].Name = ParName; - cmds[j].Type = va_arg(args, int); - cmds[j].Val = va_arg(args, void *); - switch(cmds[j].Type) { - case CMDENUMTYPE: /* get the pointer to Enum_T struct */ - cmds[j].p = va_arg(args, void *); - break; - case CMDSUBRANGETYPE: /* get the two extremes */ - cmds[j].p = (void*) calloc(2, sizeof(int)); - ((int*)cmds[j].p)[0] = va_arg(args, int); - ((int*)cmds[j].p)[1] = va_arg(args, int); - break; - case CMDGTETYPE: /* get lower or upper bound */ - case CMDLTETYPE: - cmds[j].p = (void*) calloc(1, sizeof(int)); - ((int*)cmds[j].p)[0] = va_arg(args, int); - break; - case CMDSTRARRAYTYPE: /* get the separators string */ - cmds[j].p = (s=va_arg(args, char*)) - ? 
(void*)strdup(s) : 0; - break; - case CMDBOOLTYPE: - cmds[j].Type = CMDENUMTYPE; - cmds[j].p = BoolEnum; - break; - case CMDDOUBLETYPE: /* nothing else is needed */ - case CMDINTTYPE: - case CMDSTRINGTYPE: - break; - default: - fprintf(stderr, "%s: %s %d %s \"%s\"\n", - "DeclareParam()", "Unknown Type", - cmds[j].Type, "for parameter", cmds[j].Name); - exit(1); - } - ParamN++; - ParName = va_arg(args, char *); - } - cmds[ParamN].Name = NULL; - va_end(args); - return 0; -} - -int GetParams(n, a, CmdFileName) -int *n; -char ***a; -char *CmdFileName; -{ - char *Line, - *ProgName; - int argc = *n; - char **argv = *a, - *s; - FILE *fp; - int IsPipe; - -#ifdef MSDOS -#define PATHSEP '\\' - char *dot = NULL; -#else -#define PATHSEP '/' -#endif - - if(!(Line=malloc(LINSIZ))) { - fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n", - LINSIZ); - exit(1); - } - if((ProgName=strrchr(*argv, PATHSEP))) { - ++ProgName; - } else { - ProgName = *argv; - } -#ifdef MSDOS - if(dot=strchr(ProgName, '.')) *dot = 0; -#endif - --argc; - ++argv; - for(;;) { - if(argc && argv[0][0]=='-' && argv[0][1]=='=') { - CmdFileName = argv[0]+2; - ++argv; - --argc; - } - if(!CmdFileName) { - break; - } - IsPipe = !strncmp(CmdFileName, "@@", 2); - fp = IsPipe - ? popen(CmdFileName+2, "r") - : strcmp(CmdFileName, "-") - ? fopen(CmdFileName, "r") - : stdin; - if(!fp) { - fprintf(stderr, "Unable to open command file %s\n", - CmdFileName); - exit(1); - } - while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) { - if(Scan(ProgName, cmds, Line)) { - CmdError(Line); - } - } - if(fp!=stdin) { - if(IsPipe) pclose(fp); else fclose(fp); - } - CmdFileName = NULL; - } - while(argc && **argv=='-' && (s=strchr(*argv, '='))) { - *s = ' '; - sprintf(Line, "%s/%s", ProgName, *argv+1); - *s = '='; - if(Scan(ProgName, cmds, Line)) CmdError(*argv); - --argc; - ++argv; - } - *n = argc; - *a = argv; -#ifdef MSDOS - if(dot) *dot = '.'; -#endif - free(Line); - return 0; -} - -int PrintParams(ValFlag, fp) -int ValFlag; -FILE *fp; -{ - int i; - - fflush(fp); - if(ValFlag) { - fprintf(fp, "Parameters Values:\n"); - } else { - fprintf(fp, "Parameters:\n"); - } - for(i=0; cmds[i].Name; i++) PrintParam(cmds+i, ValFlag, fp); - fprintf(fp, "\n"); - fflush(fp); - return 0; -} - -int SPrintParams(a, pfx) -char ***a, - *pfx; -{ - int l, - n; - Cmd_T *cmd; - - if(!pfx) pfx=""; - l = strlen(pfx); - for(n=0, cmd=cmds; cmd->Name; cmd++) n += !!cmd->ArgStr; - a[0] = calloc(n, sizeof(char*)); - for(n=0, cmd=cmds; cmd->Name; cmd++) { - if(!cmd->ArgStr) continue; - a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2); - sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr); - ++n; - } - return n; -} - -static int CmdError(opt) -char *opt; -{ - fprintf(stderr, "Invalid option \"%s\"\n", opt); - fprintf(stderr, "This program expectes the following parameters:\n"); - PrintParams(FALSE, stderr); - exit(0); -} - -static int PrintParam(cmd, ValFlag, fp) -Cmd_T *cmd; -int ValFlag; -FILE *fp; -{ - fprintf(fp, "%4s", ""); - switch(cmd->Type) { - case CMDDOUBLETYPE: - fprintf(fp, "%s", cmd->Name); - if(ValFlag) fprintf(fp, ": %22.15e", *(double *)cmd->Val); - fprintf(fp, "\n"); - break; - case CMDENUMTYPE: - PrintEnum(cmd, ValFlag, fp); - break; - case CMDINTTYPE: - case CMDSUBRANGETYPE: - case CMDGTETYPE: - case CMDLTETYPE: - fprintf(fp, "%s", cmd->Name); - if(ValFlag) fprintf(fp, ": %d", *(int *)cmd->Val); - fprintf(fp, "\n"); - break; - case CMDSTRINGTYPE: - fprintf(fp, "%s", cmd->Name); - if(ValFlag) { - if(*(char **)cmd->Val) { - fprintf(fp, ": \"%s\"", 
*(char **)cmd->Val); - } else { - fprintf(fp, ": %s", "NULL"); - } - } - fprintf(fp, "\n"); - break; - case CMDSTRARRAYTYPE: - PrintStrArray(cmd, ValFlag, fp); - break; - default: - fprintf(stderr, "%s: %s %d %s \"%s\"\n", - "PrintParam", - "Unknown Type", - cmd->Type, - "for parameter", - cmd->Name); - exit(1); - } - return 0; -} - -static char *GetLine(fp, n, Line) -FILE *fp; -int n; -char *Line; -{ - int j, - l, - offs=0; - - for(;;) { - if(!fgets(Line+offs, n-offs, fp)) { - return NULL; - } - if(Line[offs]=='#') continue; - l = strlen(Line+offs)-1; - Line[offs+l] = 0; - for(j=offs; Line[j] && isspace(Line[j]); j++, l--) - ; - if(l<1) continue; - if(j > offs) { - char *s = Line+offs, - *q = Line+j; - - while((*s++=*q++)) - ; - } - if(Line[offs+l-1]=='\\') { - offs += l; - Line[offs-1] = ' '; - } else { - break; - } - } - return Line; -} - -static int Scan(ProgName, cmds, Line) -char *ProgName, - *Line; -Cmd_T *cmds; -{ - char *q, - *p; - int i, - hl, - HasToMatch = FALSE, - c0, - c; - - p = Line+strspn(Line, SepString); - if(!(hl=strcspn(p, SepString))) { - return 0; - } - if((q=strchr(p, '/')) && q-pType != CMDSTRINGTYPE) { - fprintf(stderr, - "WARNING: No value specified for parameter \"%s\"\n", - cmd->Name); - return 0; - } - switch(cmd->Type) { - case CMDDOUBLETYPE: - if(sscanf(s, "%lf", (double*)cmd->Val)!=1) { - fprintf(stderr, - "Float value required for parameter \"%s\"\n", - cmd->Name); - exit(1); - } - break; - case CMDENUMTYPE: - SetEnum(cmd, s); - break; - case CMDINTTYPE: - if(sscanf(s, "%d", (int*)cmd->Val)!=1) { - fprintf(stderr, - "Integer value required for parameter \"%s\"\n", - cmd->Name); - exit(1); - } - break; - case CMDSTRINGTYPE: - *(char **)cmd->Val = (strcmp(s, "") && strcmp(s, "NULL")) - ? strdup(s) - : 0; - break; - case CMDSTRARRAYTYPE: - SetStrArray(cmd, s); - break; - case CMDGTETYPE: - SetGte(cmd, s); - break; - case CMDLTETYPE: - SetLte(cmd, s); - break; - case CMDSUBRANGETYPE: - SetSubrange(cmd, s); - break; - default: - fprintf(stderr, "%s: %s %d %s \"%s\"\n", - "SetParam", - "Unknown Type", - cmd->Type, - "for parameter", - cmd->Name); - exit(1); - } - cmd->ArgStr = strdup(s); - return 0; -} - -static int SetEnum(cmd, s) -Cmd_T *cmd; -char *s; -{ - Enum_T *en; - - for(en=(Enum_T *)cmd->p; en->Name; en++) { - if(*en->Name && !strcmp(s, en->Name)) { - *(int *) cmd->Val = en->Idx; - return 0; - } - } - return EnumError(cmd, s); -} - -static int SetSubrange(cmd, s) -Cmd_T *cmd; -char *s; -{ - int n; - - if(sscanf(s, "%d", &n)!=1) { - fprintf(stderr, - "Integer value required for parameter \"%s\"\n", - cmd->Name); - exit(1); - } - if(n < *(int *)cmd->p || n > *((int *)cmd->p+1)) { - return SubrangeError(cmd, n); - } - *(int *)cmd->Val = n; - return 0; -} - -static int SetGte(cmd, s) -Cmd_T *cmd; -char *s; -{ - int n; - - if(sscanf(s, "%d", &n)!=1) { - fprintf(stderr, - "Integer value required for parameter \"%s\"\n", - cmd->Name); - exit(1); - } - if(n<*(int *)cmd->p) { - return GteError(cmd, n); - } - *(int *)cmd->Val = n; - return 0; -} - -static int SetStrArray(cmd, s) -Cmd_T *cmd; -char *s; -{ - *(char***)cmd->Val = str2array(s, (char*)cmd->p); - return 0; -} - -static int SetLte(cmd, s) -Cmd_T *cmd; -char *s; -{ - int n; - - if(sscanf(s, "%d", &n)!=1) { - fprintf(stderr, - "Integer value required for parameter \"%s\"\n", - cmd->Name); - exit(1); - } - if(n > *(int *)cmd->p) { - return LteError(cmd, n); - } - *(int *)cmd->Val = n; - return 0; -} - -static int EnumError(cmd, s) -Cmd_T *cmd; -char *s; -{ - Enum_T *en; - - fprintf(stderr, - "Invalid value 
\"%s\" for parameter \"%s\"\n", s, cmd->Name); - fprintf(stderr, "Valid values are:\n"); - for(en=(Enum_T *)cmd->p; en->Name; en++) { - if(*en->Name) { - fprintf(stderr, " %s\n", en->Name); - } - } - fprintf(stderr, "\n"); - exit(1); -} - -static int GteError(cmd, n) -Cmd_T *cmd; -int n; -{ - fprintf(stderr, - "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); - fprintf(stderr, "Valid values must be greater than or equal to %d\n", - *(int *)cmd->p); - exit(1); -} - -static int LteError(cmd, n) -Cmd_T *cmd; -int n; -{ - fprintf(stderr, - "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); - fprintf(stderr, "Valid values must be less than or equal to %d\n", - *(int *)cmd->p); - exit(1); -} - -static int SubrangeError(cmd, n) -Cmd_T *cmd; -int n; -{ - fprintf(stderr, - "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); - fprintf(stderr, "Valid values range from %d to %d\n", - *(int *)cmd->p, *((int *)cmd->p+1)); - exit(1); -} - -static int PrintEnum(cmd, ValFlag, fp) -Cmd_T *cmd; -int ValFlag; -FILE *fp; -{ - Enum_T *en; - - fprintf(fp, "%s", cmd->Name); - if(ValFlag) { - for(en=(Enum_T *)cmd->p; en->Name; en++) { - if(*en->Name && en->Idx==*(int *)cmd->Val) { - fprintf(fp, ": %s", en->Name); - } - } - } - fprintf(fp, "\n"); - return 0; -} - -static int PrintStrArray(cmd, ValFlag, fp) -Cmd_T *cmd; -int ValFlag; -FILE *fp; -{ - char *indent, - **s = *(char***)cmd->Val; - int l = 4+strlen(cmd->Name); - - fprintf(fp, "%s", cmd->Name); - indent = malloc(l+2); - memset(indent, ' ', l+1); - indent[l+1] = 0; - if(ValFlag) { - fprintf(fp, ": %s", s ? (*s ? *s++ : "NULL") : ""); - if(s) while(*s) { - fprintf(fp, "\n%s %s", indent, *s++); - } - } - free(indent); - fprintf(fp, "\n"); - return 0; -} - -static char **str2array(s, sep) -char *s, - *sep; -{ - char *p, - **a; - int n = 0, - l; - - if(!sep) sep = SepString; - p = s += strspn(s, sep); - while(*p) { - p += strcspn(p, sep); - p += strspn(p, sep); - ++n; - } - a = calloc(n+1, sizeof(char *)); - p = s; - n = 0; - while(*p) { - l = strcspn(p, sep); - a[n] = malloc(l+1); - memcpy(a[n], p, l); - a[n][l] = 0; - ++n; - p += l; - p += strspn(p, sep); - } - return a; -} diff --git a/ext/symal/cmd.h b/ext/symal/cmd.h deleted file mode 100644 index 17231db9..00000000 --- a/ext/symal/cmd.h +++ /dev/null @@ -1,51 +0,0 @@ - -// $Id$ - -#if !defined(CMD_H) - -#define CMD_H - -#define CMDDOUBLETYPE 1 -#define CMDENUMTYPE 2 -#define CMDINTTYPE 3 -#define CMDSTRINGTYPE 4 -#define CMDSUBRANGETYPE 5 -#define CMDGTETYPE 6 -#define CMDLTETYPE 7 -#define CMDSTRARRAYTYPE 8 -#define CMDBOOLTYPE 9 - -typedef struct { - char *Name; - int Idx; -} Enum_T; - -typedef struct { - int Type; - char *Name, - *ArgStr; - void *Val, - *p; -} Cmd_T; - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__STDC__) - int DeclareParams(char *, ...); -#else - int DeclareParams(); -#endif - - int GetParams(int *n, char ***a,char *CmdFileName), - SPrintParams(), - PrintParams(); - -#ifdef __cplusplus -} -#endif -#endif - - - diff --git a/ext/symal/giza2bal.pl b/ext/symal/giza2bal.pl deleted file mode 100755 index 553ff2b3..00000000 --- a/ext/symal/giza2bal.pl +++ /dev/null @@ -1,112 +0,0 @@ -#! /usr/bin/perl - -# $Id$ -#Converts direct and inverted alignments into a more compact -#bi-alignment format. It optionally reads the counting file -#produced by giza containing the frequency of each traning sentence. 
- -#Copyright Marcello Federico, November 2004 - -($cnt,$dir,$inv)=(); - -while ($w=shift @ARGV){ - $dir=shift(@ARGV),next if $w eq "-d"; - $inv=shift(@ARGV),next if $w eq "-i"; - $cnt=shift(@ARGV),next if $w eq "-c"; -} - -my $lc = 0; - -if (!$dir || !$inv){ - print "usage: giza2bal.pl [-c <count-file>] -d <direct-file> -i <inverse-file>\n"; - print "input files can also be commands, e.g. -d \"gunzip -c file.gz\"\n"; - exit(0); -} - -$|=1; - -open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n"; -open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n"; - -if ($cnt){ -open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n"; -} - - -sub ReadBiAlign{ - local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_; - local($dummy,$n); - - chop($c=<$fd0>); ## count - $dummy=<$fd0>; ## header - $dummy=<$fd0>; ## header - $c=1 if !$c; - - $dummy=<$fd1>; ## header - chop($s1=<$fd1>); - chop($t1=<$fd1>); - - $dummy=<$fd2>; ## header - chop($s2=<$fd2>); - chop($t2=<$fd2>); - - @a=@b=(); - $lc++; - - #get target statistics - $n=1; - $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; - while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ - grep($a[$_]=$n,split(/\s+/,$2)); - $n++; - } - - $m=1; - $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; - while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ - grep($b[$_]=$m,split(/\s+/,$2)); - $m++; - } - - $M=split(/\s+/,$s1); - $N=split(/\s+/,$s2); - - if ($m != ($M+1) || $n != ($N+1)) { - print STDERR "Sentence mismatch error! Line #$lc\n"; - $s1 = "ALIGN_ERR"; - $s2 = "ALIGN_ERR"; - @a=(); @b=(); - for ($j=1;$j<2;$j++){ $a[$j]=1; } - for ($i=1;$i<2;$i++){ $b[$i]=1; } - return 1; - } - - for ($j=1;$j<$m;$j++){ - $a[$j]=0 if !$a[$j]; - } - - for ($i=1;$i<$n;$i++){ - $b[$i]=0 if !$b[$i]; - } - - - return 1; -} - -$skip=0; -$ccc=0; -while(!eof(DIR)){ - - if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c)) - { - $ccc++; - print "$c\n"; - print $#a," $src \# @a[1..$#a]\n"; - print $#b," $tgt \# @b[1..$#b]\n"; - } - else{ - print "\n"; - print STDERR "." if !(++$skip % 1000); - } -}; -print STDERR "skip=<$skip> counts=<$ccc>\n";
diff --git a/ext/symal/symal.cpp b/ext/symal/symal.cpp deleted file mode 100644 index 8f1bac05..00000000 --- a/ext/symal/symal.cpp +++ /dev/null @@ -1,503 +0,0 @@ -// $Id$ - -#include <cassert> -#include <iostream> -#include <fstream> -#include <sstream> -#include <iomanip> -#include <string> -#include <vector> -#include <list> -#include <set> -#include <utility> -#include <cstring> -#include "cmd.h" - -using namespace std; - -#define MAX_WORD 10000 // maximum length of source/target strings -#define MAX_M 200 // maximum length of source strings -#define MAX_N 200 // maximum length of target strings - -#define UNION 1 -#define INTERSECT 2 -#define GROW 3 -#define SRCTOTGT 4 -#define TGTTOSRC 5 -#define BOOL_YES 1 -#define BOOL_NO 0 - -#define END_ENUM { (char*)0, 0 } - -static Enum_T AlignEnum [] = { - { "union", UNION }, - { "u", UNION }, - { "intersect", INTERSECT}, - { "i", INTERSECT}, - { "grow", GROW }, - { "g", GROW }, - { "srctotgt", SRCTOTGT }, - { "s2t", SRCTOTGT }, - { "tgttosrc", TGTTOSRC }, - { "t2s", TGTTOSRC }, - END_ENUM -}; - -static Enum_T BoolEnum [] = { - { "true", BOOL_YES }, - { "yes", BOOL_YES }, - { "y", BOOL_YES }, - { "false", BOOL_NO }, - { "no", BOOL_NO }, - { "n", BOOL_NO }, - END_ENUM -}; - - - -// global variables and constants - -int* fa; //counters of covered foreign positions -int* ea; //counters of covered english positions -int** A; //alignment matrix with information symmetric/direct/inverse alignments - -int verbose=0; - -//read an alignment pair from the input stream.
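-// A sketch of one .bal record, as emitted by the giza2bal.pl loop above: a count
-// line, then each sentence preceded by its length and followed by its alignment
-// vector after the '#' separator (the words and indices here are invented for
-// illustration):
-//
-//   1
-//   3 le chien noir # 1 3 2
-//   3 the black dog # 1 2 3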
- -int lc = 0; - -int getals(fstream& inp,int& m, int *a,int& n, int *b) -{ - char w[MAX_WORD], dummy[10]; - int i,j,freq; - if (inp >> freq) { - ++lc; - //target sentence - inp >> n; - assert(n<MAX_N); - for (i=1; i<=n; i++) { - inp >> setw(MAX_WORD) >> w; - if (strlen(w)>=MAX_WORD-1) { - cerr << lc << ": target len=" << strlen(w) << " is not less than MAX_WORD-1=" - << MAX_WORD-1 << endl; - assert(strlen(w)<MAX_WORD-1); - } - } - inp >> dummy; //# separator - // inverse alignment - for (i=1; i<=n; i++) inp >> b[i]; - - //source sentence - inp >> m; - assert(m<MAX_M); - for (j=1; j<=m; j++) { - inp >> setw(MAX_WORD) >> w; - if (strlen(w)>=MAX_WORD-1) { - cerr << lc << ": source len=" << strlen(w) << " is not less than MAX_WORD-1=" - << MAX_WORD-1 << endl; - assert(strlen(w)<MAX_WORD-1); - } - } - inp >> dummy; //# separator - - // direct alignment - for (j=1; j<=m; j++) { - inp >> a[j]; - assert(0<=a[j] && a[j]<=n); - } - - //check inverse alignment - for (i=1; i<=n; i++) - assert(0<=b[i] && b[i]<=m); - - return 1; - - } else - return 0; -}; - - -//compute union alignment -int prunionalignment(fstream& out,int m,int *a,int n,int* b) -{ - - ostringstream sout; - - for (int j=1; j<=m; j++) - if (a[j]) - sout << j-1 << "-" << a[j]-1 << " "; - - for (int i=1; i<=n; i++) - if (b[i] && a[b[i]]!=i) - sout << b[i]-1 << "-" << i-1 << " "; - - //fix the last " " - string str = sout.str(); - if (str.length() == 0) - str = "\n"; - else - str.replace(str.length()-1,1,"\n"); - - out << str; - out.flush(); - - return 1; -} - - -//Compute intersection alignment - -int printersect(fstream& out,int m,int *a,int n,int* b) -{ - - ostringstream sout; - - for (int j=1; j<=m; j++) - if (a[j] && b[a[j]]==j) - sout << j-1 << "-" << a[j]-1 << " "; - - //fix the last " " - string str = sout.str(); - if (str.length() == 0) - str = "\n"; - else - str.replace(str.length()-1,1,"\n"); - - out << str; - out.flush(); - - return 1; -} - -//Compute target-to-source alignment - -int printtgttosrc(fstream& out,int m,int *a,int n,int* b) -{ - - ostringstream sout; - - for (int i=1; i<=n; i++) - if (b[i]) - sout << b[i]-1 << "-" << i-1 << " "; - - //fix the last " " - string str = sout.str(); - if (str.length() == 0) - str = "\n"; - else - str.replace(str.length()-1,1,"\n"); - - out << str; - out.flush(); - - return 1; -} - -//Compute source-to-target alignment - -int printsrctotgt(fstream& out,int m,int *a,int n,int* b) -{ - - ostringstream sout; - - for (int j=1; j<=m; j++) - if (a[j]) - sout << j-1 << "-" << a[j]-1 << " "; - - //fix the last " " - string str = sout.str(); - if (str.length() == 0) - str = "\n"; - else - str.replace(str.length()-1,1,"\n"); - - out << str; - out.flush(); - - return 1; -} - -//Compute Grow Diagonal Alignment -//Nice property: you will never introduce more points -//than the union alignment.
Hence, you will always be able -//to represent the grow alignment as the union of a -//directed and inverted alignment - -int printgrow(fstream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false) -{ - - ostringstream sout; - - vector<pair<int,int> > neighbors; //neighbors - - pair<int,int> entry; - - neighbors.push_back(make_pair(-1,-0)); - neighbors.push_back(make_pair(0,-1)); - neighbors.push_back(make_pair(1,0)); - neighbors.push_back(make_pair(0,1)); - - - if (diagonal) { - neighbors.push_back(make_pair(-1,-1)); - neighbors.push_back(make_pair(-1,1)); - neighbors.push_back(make_pair(1,-1)); - neighbors.push_back(make_pair(1,1)); - } - - - int i,j,o; - - - //covered foreign and english positions - - memset(fa,0,(m+1)*sizeof(int)); - memset(ea,0,(n+1)*sizeof(int)); - - //matrix to quickly check if one point is in the symmetric - //alignment (value=2), direct alignment (=1) and inverse alignment - - for (int i=1; i<=n; i++) memset(A[i],0,(m+1)*sizeof(int)); - - set<pair<int,int> > currentpoints; //symmetric alignment - set<pair<int,int> > unionalignment; //union alignment - - pair<int,int> point; //variable to store points - set<pair<int,int> >::const_iterator k; //iterator over sets - - //fill in the alignments - for (j=1; j<=m; j++) { - if (a[j]) { - unionalignment.insert(make_pair(a[j],j)); - if (b[a[j]]==j) { - fa[j]=1; - ea[a[j]]=1; - A[a[j]][j]=2; - currentpoints.insert(make_pair(a[j],j)); - } else - A[a[j]][j]=-1; - } - } - - for (i=1; i<=n; i++) - if (b[i] && a[b[i]]!=i) { //not intersection - unionalignment.insert(make_pair(i,b[i])); - A[i][b[i]]=1; - } - - - int added=1; - - while (added) { - added=0; - ///scan the current alignment - for (k=currentpoints.begin(); k!=currentpoints.end(); k++) { - //cout << "{"<< (k->second)-1 << "-" << (k->first)-1 << "}"; - for (o=0; o<(int)neighbors.size(); o++) { - point.first=k->first+neighbors[o].first; - point.second=k->second+neighbors[o].second; - //cout << point.second-1 << " " << point.first-1 << "\n"; - //check if neighbor is inside 'matrix' - if (point.first>0 && point.first <=n && point.second>0 && point.second<=m) - //check if neighbor is in the unionalignment alignment - if (b[point.first]==point.second || a[point.second]==point.first) { - //cout << "In unionalignment ";cout.flush(); - //check if it connects at least one uncovered word - if (!(ea[point.first] && fa[point.second])) { - //insert point in currentpoints! - currentpoints.insert(point); - A[point.first][point.second]=2; - ea[point.first]=1; - fa[point.second]=1; - added=1; - //cout << "added grow: " << point.second-1 << "-" << point.first-1 << "\n";cout.flush(); - } - } - } - } - } - - if (final) { - for (k=unionalignment.begin(); k!=unionalignment.end(); k++) - if (A[k->first][k->second]==1) { - point.first=k->first; - point.second=k->second; - //one of the two words is not covered yet - //cout << "{" << point.second-1 << "-" << point.first-1 << "} "; - if ((bothuncovered && !ea[point.first] && !fa[point.second]) || - (!bothuncovered && !(ea[point.first] && fa[point.second]))) { - //add it!
- currentpoints.insert(point); - A[point.first][point.second]=2; - //keep track of new covered positions - ea[point.first]=1; - fa[point.second]=1; - - //added=1; - //cout << "added final: " << point.second-1 << "-" << point.first-1 << "\n"; - } - } - - for (k=unionalignment.begin(); k!=unionalignment.end(); k++) - if (A[k->first][k->second]==-1) { - point.first=k->first; - point.second=k->second; - //one of the two words is not covered yet - //cout << "{" << point.second-1 << "-" << point.first-1 << "} "; - if ((bothuncovered && !ea[point.first] && !fa[point.second]) || - (!bothuncovered && !(ea[point.first] && fa[point.second]))) { - //add it! - currentpoints.insert(point); - A[point.first][point.second]=2; - //keep track of new covered positions - ea[point.first]=1; - fa[point.second]=1; - - //added=1; - //cout << "added final: " << point.second-1 << "-" << point.first-1 << "\n"; - } - } - } - - - for (k=currentpoints.begin(); k!=currentpoints.end(); k++) - sout << k->second-1 << "-" << k->first-1 << " "; - - - //fix the last " " - string str = sout.str(); - if (str.length() == 0) - str = "\n"; - else - str.replace(str.length()-1,1,"\n"); - - out << str; - out.flush(); - return 1; - - return 1; -} - - - -//Main file here - - -int main(int argc, char** argv) -{ - - int alignment=0; - char* input="/dev/stdin"; - char* output="/dev/stdout"; - int diagonal=false; - int final=false; - int bothuncovered=false; - - - DeclareParams("a", CMDENUMTYPE, &alignment, AlignEnum, - "alignment", CMDENUMTYPE, &alignment, AlignEnum, - "d", CMDENUMTYPE, &diagonal, BoolEnum, - "diagonal", CMDENUMTYPE, &diagonal, BoolEnum, - "f", CMDENUMTYPE, &final, BoolEnum, - "final", CMDENUMTYPE, &final, BoolEnum, - "b", CMDENUMTYPE, &bothuncovered, BoolEnum, - "both", CMDENUMTYPE, &bothuncovered, BoolEnum, - "i", CMDSTRINGTYPE, &input, - "o", CMDSTRINGTYPE, &output, - "v", CMDENUMTYPE, &verbose, BoolEnum, - "verbose", CMDENUMTYPE, &verbose, BoolEnum, - - (char *)NULL); - - GetParams(&argc, &argv, (char*) NULL); - - if (alignment==0) { - cerr << "usage: symal [-i=] [-o=] -a=[u|i|g] -d=[yes|no] -b=[yes|no] -f=[yes|no] \n" - << "Input file or std must be in .bal format (see script giza2bal.pl).\n"; - - exit(1); - - } - - fstream inp(input,ios::in); - fstream out(output,ios::out); - - if (!inp.is_open()) { - cerr << "cannot open " << input << "\n"; - exit(1); - } - - if (!out.is_open()) { - cerr << "cannot open " << output << "\n"; - exit(1); - } - - - int a[MAX_M],b[MAX_N],m,n; - fa=new int[MAX_M+1]; - ea=new int[MAX_N+1]; - - - int sents = 0; - A=new int *[MAX_N+1]; - for (int i=1; i<=MAX_N; i++) A[i]=new int[MAX_M+1]; - - switch (alignment) { - case UNION: - cerr << "symal: computing union alignment\n"; - while(getals(inp,m,a,n,b)) { - prunionalignment(out,m,a,n,b); - sents++; - } - cerr << "Sents: " << sents << endl; - break; - case INTERSECT: - cerr << "symal: computing intersect alignment\n"; - while(getals(inp,m,a,n,b)) { - printersect(out,m,a,n,b); - sents++; - } - cerr << "Sents: " << sents << endl; - break; - case GROW: - cerr << "symal: computing grow alignment: diagonal (" - << diagonal << ") final ("<< final << ")" - << "both-uncovered (" << bothuncovered <<")\n"; - - while(getals(inp,m,a,n,b)) - printgrow(out,m,a,n,b,diagonal,final,bothuncovered); - - break; - case TGTTOSRC: - cerr << "symal: computing target-to-source alignment\n"; - - while(getals(inp,m,a,n,b)) { - printtgttosrc(out,m,a,n,b); - sents++; - } - cerr << "Sents: " << sents << endl; - break; - case SRCTOTGT: - cerr << "symal: computing 
source-to-target alignment\n"; - - while(getals(inp,m,a,n,b)) { - printsrctotgt(out,m,a,n,b); - sents++; - } - cerr << "Sents: " << sents << endl; - break; - default: - exit(1); - } - - delete [] fa; - delete [] ea; - for (int i=1; i<=MAX_N; i++) delete [] A[i]; - delete [] A; - - exit(0); -} diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc index 16cb54bc..64c9fe9e 100644 --- a/jni/kenlm_wrap.cc +++ b/jni/kenlm_wrap.cc @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include "lm/enumerate_vocab.hh" #include "lm/model.hh" #include "lm/left.hh" diff --git a/lib/.gitignore b/lib/.gitignore deleted file mode 100644 index 767c1061..00000000 --- a/lib/.gitignore +++ /dev/null @@ -1,13 +0,0 @@ -* -cache/ -!.gitignore -!BerkeleyParser.jar -!LICENSES -!README -!berkeleyaligner.jar -!berkeleylm.jar -!collections-generic-4.01.jar -!eng_sm6.gr -!ivy.xml -!ivysettings.xml -!jacana-xy.jar diff --git a/lib/BerkeleyParser.jar b/lib/BerkeleyParser.jar deleted file mode 100644 index 6a66023d..00000000 Binary files a/lib/BerkeleyParser.jar and /dev/null differ diff --git a/lib/LICENSES/LICENSE-jung.txt b/lib/LICENSES/LICENSE-jung.txt deleted file mode 100644 index 78f7ffa1..00000000 --- a/lib/LICENSES/LICENSE-jung.txt +++ /dev/null @@ -1,45 +0,0 @@ -According to http://jung.sourceforge.net/faq.html, "JUNG is licensed -and made freely available under the Berkeley Software Distribution -(BSD) license." This is confirmed by http://sourceforge.net/projects/jung. -However no license file is made available either on the website or -in any of the sources that can be downloaded. - -Therefore we assume this definition from -http://www.opensource.org/licenses/bsd-license.php; this should be -replaced by an actual license once one becomes available. - - - JUNG is licensed under a "BSD-style" license: - -Copyright (c) 2003--2009, JUNG Framework Development Team -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - -* Neither the name of the Java Universal Network/Graph Framework - nor the names of its contributors may be used to endorse or promote - products derived from this software without specific prior written - permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/lib/LICENSES/LICENSE-pmd.txt b/lib/LICENSES/LICENSE-pmd.txt deleted file mode 100644 index 4323694d..00000000 --- a/lib/LICENSES/LICENSE-pmd.txt +++ /dev/null @@ -1,36 +0,0 @@ -This licence is copied from http://pmd.sourceforge.net/license.html - - - PMD is licensed under a "BSD-style" license: - -Copyright (c) 2002-2009, InfoEther, Inc -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - * The end-user documentation included with the redistribution, if -any, must include the following acknowledgement: - "This product includes software developed in part by support from -the Defense Advanced Research Project Agency (DARPA)" - * Neither the name of InfoEther, LLC nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/lib/README b/lib/README deleted file mode 100644 index 588e75ad..00000000 --- a/lib/README +++ /dev/null @@ -1,26 +0,0 @@ -This file contains a listing of which packages in Joshua require -which jar files. 
- - -The "pmd" Ant task (not actually Joshua code) requires: -* pmd-4.2.5.jar -* jaxen-1.1.1.jar -* asm-3.1.jar - - -The "test" Ant task (not actually Joshua code) requires: -* testng-5.8-jdk15.jar - - -The joshua.ui.alignment_visualizer.* and joshua.ui.tree_visualizer.* -code requires: -* jung-api-2.0.jar -* jung-graph-impl-2.0.jar -* jung-algorithms-2.0.jar -* jung-visualization-2.0.jar -* collections-generic-4.01.jar - - -The joshua.subsample.* code requires: -* commons-cli-2.0-SNAPSHOT.jar -(But we hope to remove this dependency in the future.) diff --git a/lib/berkeleyaligner.jar b/lib/berkeleyaligner.jar deleted file mode 100644 index 63c4e961..00000000 Binary files a/lib/berkeleyaligner.jar and /dev/null differ diff --git a/lib/eng_sm6.gr b/lib/eng_sm6.gr deleted file mode 100644 index 4aa10360..00000000 Binary files a/lib/eng_sm6.gr and /dev/null differ diff --git a/lib/fastutil.jar b/lib/fastutil.jar deleted file mode 100644 index 5e4700d6..00000000 Binary files a/lib/fastutil.jar and /dev/null differ diff --git a/lib/ghkm-modified.jar b/lib/ghkm-modified.jar deleted file mode 100644 index f59fec42..00000000 Binary files a/lib/ghkm-modified.jar and /dev/null differ diff --git a/lib/ivy.xml b/lib/ivy.xml deleted file mode 100644 index d41595d6..00000000 --- a/lib/ivy.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - - - - - - - - - diff --git a/lib/ivysettings.xml b/lib/ivysettings.xml deleted file mode 100644 index a6fd635f..00000000 --- a/lib/ivysettings.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/lib/jacana-xy.jar b/lib/jacana-xy.jar deleted file mode 100644 index 00e0ff6d..00000000 Binary files a/lib/jacana-xy.jar and /dev/null differ diff --git a/pom.xml b/pom.xml index 09400233..b5f7df73 100644 --- a/pom.xml +++ b/pom.xml @@ -15,75 +15,136 @@ See the License for the specific language governing permissions and limitations under the License. --> - - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 + + org.apache + apache + 10 + org.apache.joshua joshua jar 6.0.6-SNAPSHOT + Apache Joshua Machine Translation Toolkit + Joshua is an open-source statistical machine + translation decoder for phrase-based, hierarchical, + and syntax-based machine translation, written in Java. 
+ + http://joshua.incubator.apache.org + 2016 + + + 1.7.21 + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + + The Apache Software Foundation + http://www.apache.org/ + + + + + lewismc + Lewis John McGibbney + lewismc [at] apache [dot] org + + Committer + PMC Member + + + + + + + Dev Mailing List + dev[at]joshua[dot]incubator[dot]apache[dot]org + dev-subscribe[at]joshua[dot]incubator[dot]apache[dot]org + dev-unsubscribe[at]joshua[dot]incubator[dot]apache[dot]org + http://mail-archives.apache.org/mod_mbox/incubator-joshua-dev/ + + + + User Mailing List + user[at]joshua[dot]incubator[dot]apache[dot]org + user-subscribe[at]joshua[dot]incubator[dot]apache[dot]org + user-unsubscribe[at]joshua[dot]incubator[dot]apache[dot]org + http://mail-archives.apache.org/mod_mbox/incubator-joshua-user/ + + + + Commits Mailing List + commits[at]joshua[dot]incubator[dot]apache[dot]org + commits-subscribe[at]joshua[dot]incubator[dot]apache[dot]org + commits-unsubscribe[at]joshua[dot]incubator[dot]apache[dot]org + http://mail-archives.apache.org/mod_mbox/incubator-joshua-commits/ + + + + + scm:git:http://git-wip-us.apache.org/repos/asf/incubator-joshua.git + scm:git:http://git-wip-us.apache.org/repos/asf/incubator-joshua.git + https://git-wip-us.apache.org/repos/asf/incubator-joshua.git + HEAD + + + JIRA + https://issues.apache.org/jira/browse/JOSHUA + + + Jenkins + https://builds.apache.org/job/joshua_master/ + + - src + install + target + ${basedir}/target/classes + ${project.artifactId}-${project.version} + ${basedir}/target/test-classes + ${basedir}/src/main/java + ${basedir}/src/test/java maven-compiler-plugin - 3.1 1.8 1.8 + + maven-assembly-plugin + + + + org.apache.joshua.decoder.JoshuaDecoder + + + + jar-with-dependencies + + + - - - - - - edu.berkeley.nlp berkeleylm 1.1.2 - asm - asm - 3.1 - true - - - com.amazonaws - aws-java-sdk - 1.1.3 - true - - - commons-cli commons-cli - 1.2 - - commons-logging - commons-logging - 1.1.1 - true - - - jaxen - jaxen - 1.1.1 - true - net.sf.jung jung-algorithms @@ -109,49 +170,49 @@ true - org.apache.hadoop - hadoop-core - 0.20.203.0 - true + com.google.guava + guava + 19.0 - org.testng - testng - 6.7 - true + com.google.code.gson + gson + 2.5 - org.mockito - mockito-all - 1.9.5 - true + args4j + args4j + 2.0.29 - pmd - pmd - 4.2.5 - true + org.slf4j + slf4j-api + ${slf4j.version} + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + junit junit 4.10 - true + test - com.google.guava - guava - 19.0 - - - com.google.code.gson - gson - 2.3 + org.testng + testng + 6.9.10 + test - args4j - args4j - 2.0.26 + org.mockito + mockito-core + 2.0.52-beta + test diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl index deb6ebc1..8c26dd82 100755 --- a/scripts/training/pipeline.pl +++ b/scripts/training/pipeline.pl @@ -1,5 +1,20 @@ #!/usr/bin/env perl +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # This script implements the Joshua pipeline. It can run a complete # pipeline --- from raw training corpora to bleu scores on a test set # --- and it allows jumping into arbitrary points of the pipeline. diff --git a/src/joshua/corpus/package.html b/src/joshua/corpus/package.html deleted file mode 100644 index 7643936b..00000000 --- a/src/joshua/corpus/package.html +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - - -Provides data structures for representing and manipulating corpora -and phrases extracted from corpora. - - - - - - diff --git a/src/joshua/decoder/chart_parser/package.html b/src/joshua/decoder/chart_parser/package.html deleted file mode 100644 index d7ca8f6f..00000000 --- a/src/joshua/decoder/chart_parser/package.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - -Provides an implementation of a hierarchical phrase-based decoder for statistical machine translation. - -

Related Documentation

  • The code in this package is based largely on algorithms from Chiang (2007). -
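  (As a concrete example of the grammars those algorithms handle, Chiang (2007) decodes with synchronous context-free rules; one often-cited example from the paper is X → ⟨yu X₁ you X₂, have X₂ with X₁⟩, in which the co-indexed non-terminals link the source and target sides.)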
- - - - - diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html b/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html deleted file mode 100644 index 883594ad..00000000 --- a/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - - -Provides an implementation of a Bloom filter language model, and -an associated implementation of the language model feature function typically used in -hierarchical phrase-based decoding for statistical machine translation. - - - - - diff --git a/src/joshua/decoder/ff/lm/package.html b/src/joshua/decoder/ff/lm/package.html deleted file mode 100644 index b99a2450..00000000 --- a/src/joshua/decoder/ff/lm/package.html +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - -Provides abstraction and support for the language model feature function typically used in -hierarchical phrase-based decoding for statistical machine translation. - -The classes contained within this directory are responsible for two tasks: implementing the feature -function, and representing the language model itself. The class `LanguageModelFF` implements the -feature function by extending the class `DefaultStatefulFF`. One of these is instantiated for each -language model present in the decoder. - -The language models themselves are implemented as a combination of an interface -(`NGramLanguageModel`), a default implementation (`DefaultNgramLanguageModel`), and an abstract -implementation of the default (`AbstractLM`). -
-  DefaultStatefulFF
-  |- LanguageModelFF
-
-  DefaultNgramLanguageModel implements interface NGramLanguageModel
-  |- AbstractLM
-
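A minimal Java sketch of that arrangement (the method names and signatures here are illustrative assumptions, not this package's exact API):

  // Sketch only: the real definitions live in joshua.decoder.ff.lm.
  interface NGramLanguageModel {
    double ngramLogProbability(int[] ngram);  // assumed signature
  }

  abstract class DefaultNgramLanguageModel implements NGramLanguageModel {
    protected final int order;  // the n-gram order
    protected DefaultNgramLanguageModel(int order) { this.order = order; }

    // A sentence log-probability decomposes into one n-gram query per word.
    public double sentenceLogProbability(int[] words) {
      double total = 0.0;
      for (int i = 0; i < words.length; i++) {
        int from = Math.max(0, i - order + 1);
        total += ngramLogProbability(java.util.Arrays.copyOfRange(words, from, i + 1));
      }
      return total;
    }
  }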
- - - - - diff --git a/src/joshua/decoder/ff/package.html b/src/joshua/decoder/ff/package.html deleted file mode 100644 index b0aa63ed..00000000 --- a/src/joshua/decoder/ff/package.html +++ /dev/null @@ -1,37 +0,0 @@ - - - - - - - -Provides an implementation of the linear feature functions typically used in -hierarchical phrase-based decoding for statistical machine translation. - -The following is a note from Juri describing some of the functionality of the feature functions -interfaces and default abstract classes. - -
-The equality that I intended for is ff.transitionLogP() =
-ff.estimateLogP() + ff.reEstimateTransitionLogP(). The re-estimate
-fixes the estimate to be the true transition cost that takes into
-account the state. Before decoding the cost of applying a rule is
-estimated via estimateLogP() and yields the phrasal feature costs plus
-an LM estimate of the cost of the lexical portions of the rule.
-transitionLogP() takes rule and state and computes everything from
-scratch, whereas reEstimateTransitionLogP() adds in the cost of new
-n-grams that result from combining the rule with the LM states and
-subtracts out the cost of superfluous less-than-n-grams that were
-overridden by the updated cost calculation.
-
-Hope this helps.
-
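In code form, the identity in that note is simply the following (a hypothetical check; the three doubles stand in for the corresponding feature-function calls):

  // ff.transitionLogP() == ff.estimateLogP() + ff.reEstimateTransitionLogP()
  static void checkCostIdentity(double transitionLogP, double estimateLogP,
                                double reEstimateTransitionLogP) {
    double recomposed = estimateLogP + reEstimateTransitionLogP;
    assert Math.abs(transitionLogP - recomposed) < 1e-9
        : "transition cost must equal the estimate plus the re-estimate correction";
  }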
- - - - - diff --git a/src/joshua/decoder/ff/tm/hash_based/package.html b/src/joshua/decoder/ff/tm/hash_based/package.html deleted file mode 100644 index 88ded5d2..00000000 --- a/src/joshua/decoder/ff/tm/hash_based/package.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - -Provides implementations of hierarchical phrase-based translation grammars. - - - - - diff --git a/src/joshua/decoder/ff/tm/package.html b/src/joshua/decoder/ff/tm/package.html deleted file mode 100644 index bf995947..00000000 --- a/src/joshua/decoder/ff/tm/package.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - -Defines interfaces and provides infrastructure for hierarchical phrase-based translation grammars. - - - - - diff --git a/src/joshua/decoder/hypergraph/package.html b/src/joshua/decoder/hypergraph/package.html deleted file mode 100644 index 6fdd043b..00000000 --- a/src/joshua/decoder/hypergraph/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides implementations of hypergraph data structures and related algorithms -used in extracting translation results in hierarchical phrase-based translation. - - - - - diff --git a/src/joshua/decoder/package.html b/src/joshua/decoder/package.html deleted file mode 100644 index fda252ea..00000000 --- a/src/joshua/decoder/package.html +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - -Provides infrastructure and wrapper code relevant to -hierarchical phrase-based decoding for statistical machine translation.

-This package does not include an implementation of any actual decoding algorithm. -Rather, such code is in child packages of this package. - - - - - \ No newline at end of file diff --git a/src/joshua/decoder/segment_file/package.html b/src/joshua/decoder/segment_file/package.html deleted file mode 100644 index 8f06ebc0..00000000 --- a/src/joshua/decoder/segment_file/package.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - -Provides common interfaces for parsing segment files (aka test corpora to be translated). In order to support constraint annotations, we provide a general API for use by JoshuaDecoder and Chart. - - - - - diff --git a/src/joshua/lattice/package.html b/src/joshua/lattice/package.html deleted file mode 100644 index a479be88..00000000 --- a/src/joshua/lattice/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides implementations of lattice and related data structures. - - - - - - diff --git a/src/joshua/oracle/package.html b/src/joshua/oracle/package.html deleted file mode 100644 index 0f670d30..00000000 --- a/src/joshua/oracle/package.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - -Provides for extracting the target string from a hypergraph that most closely matches a reference sentence. - - - - - - - \ No newline at end of file diff --git a/src/joshua/subsample/package.html b/src/joshua/subsample/package.html deleted file mode 100644 index bed439c5..00000000 --- a/src/joshua/subsample/package.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - -Provides executables Subsampler and AlignedSubsampler, for subsampling from large training corpora based on a test corpus. - - - - - - - diff --git a/src/joshua/ui/package.html b/src/joshua/ui/package.html deleted file mode 100644 index 2dcc44e2..00000000 --- a/src/joshua/ui/package.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - -Provides classes for visualizing parts of the translation process. - - - - - - - diff --git a/src/joshua/ui/tree_visualizer/tree/Tree.java b/src/joshua/ui/tree_visualizer/tree/Tree.java deleted file mode 100644 index 409e30a1..00000000 --- a/src/joshua/ui/tree_visualizer/tree/Tree.java +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package joshua.ui.tree_visualizer.tree; - -import java.util.Stack; -import java.util.regex.Pattern; -import java.util.regex.Matcher; -import java.util.List; -import java.util.ArrayList; -import java.util.Comparator; - -/** - * A class to represent the target-side tree produced by decoding using Joshua - * with an SCFG. - *

- * When decoding with use_tree_nbest=true, instead of a flat text output like - * "i asked her a question", we get a Penn treebank format tree like - * "(ROOT (S (NP i) (VP (V asked) (NP her) (NP (DT a) (N question)))))". - * If we also set include_align_index=true, we include source-side alignments - * for each internal node of the tree. - *

- * So, if the source input sentence is "je lui ai pose un question", if we - * turn on both configuration options, we end up with a decorated tree like - * this: - * "(ROOT{0-6} (S{0-6} (NP{0-1} i) (VP{1-6} (V{2-4} asked) (NP{1-2} her) - * (NP{4-6} (DT{4-5} a) (N{5-6} question)))))". - *

- * This class contains all the information of that flat string representation: - * the tree structure, the output (English) words, and the alignments to a - * source sentence. - *
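- * A short usage sketch (the input string is invented but follows the - * decorated format above; size() and yield() are defined further down in - * this class): - * - *   Tree t = new Tree("(ROOT{0-2} (NP{0-1} i) (V{1-2} asked))"); - *   t.size();   // 5 nodes, in depth-first order - *   t.yield();  // "i asked" - *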

- * Using a Tree the source sentence it was aligned to, we can create - * a DerivationTree object suitable for display. - * - * @author Jonny Weese - */ -public class Tree { - - /** - * An array holding the label of each node of the tree, in depth-first order. - * The label of a node means the NT label assigned to an internal node, or - * the terminal symbol (English word) at a leaf. - */ - private final String [] labels; - - /** - * The number of children of each node of the tree, in depth-first order. - */ - private final int [] numChildren; - - /** - * The smallest source-side index that each node covers, in depth-first order. - * Note that we only have this information for internal nodes. For leaves, - * this value will always be -1. - */ - private final int [] sourceStartIndices; - - /** - * 1 + the largest source-side index that each node covers, in depth-first - * order. Note that we only have this informaion for internal nodes. For - * leaves, this value will always be -1. - */ - private final int [] sourceEndIndices; - - /** - * A pattern to match an aligned internal node and pull out its information. - * This pattern matches: - * - * 1) start-of-string - * 2) ( - * 3) an arbitrary sequence of non-whitespace characters (at least 1) - * 4) { - * 5) a decimal number - * 6) - - * 7) a decimal number - * 8) } - * 9) end-of-string - * - * That is, it matches something like "(FOO{32-55}". The string and two - * decimal numbers (parts 3, 5, and 7) are captured in groups. - */ - private static final Pattern NONTERMINAL_PATTERN = - Pattern.compile("^\\((\\S+)\\{(\\d+)-(\\d+)\\}$"); - - /** - * Creates a Tree object from an input string in Penn treebank format with - * source alignment annotations. - */ - public Tree(String s) { - final String [] tokens = s.replaceAll("\\)", " )").split("\\s+"); - int numNodes = 0; - for (String t : tokens) { - if (!t.equals(")")) { - numNodes++; - } - } - labels = new String[numNodes]; - numChildren = new int[numNodes]; - sourceStartIndices = new int[numNodes]; - sourceEndIndices = new int[numNodes]; - try { - initialize(tokens); - } catch (Exception e) { - // This will catch most formatting errors. - throw new IllegalArgumentException( - String.format("couldn't create tree from string: \"%s\"", s), - e); - } - } - - private void initialize(String [] tokens) { - final Stack stack = new Stack(); - int nodeIndex = 0; - for (String token : tokens) { - final Matcher matcher = NONTERMINAL_PATTERN.matcher(token); - if (matcher.matches()) { - // new non-terminal node - labels[nodeIndex] = matcher.group(1); - sourceStartIndices[nodeIndex] = Integer.parseInt(matcher.group(2)); - sourceEndIndices[nodeIndex] = Integer.parseInt(matcher.group(3)); - stack.push(nodeIndex); - nodeIndex++; - } else if (token.equals(")")) { - // finished a subtree - stack.pop(); - if (stack.empty()) { - break; - } else { - numChildren[stack.peek()]++; - } - } else { - // otherwise, it's a new leaf node - labels[nodeIndex] = token; - sourceStartIndices[nodeIndex] = -1; - sourceEndIndices[nodeIndex] = -1; - numChildren[stack.peek()]++; - nodeIndex++; - } - } - if (!stack.empty()) { - // Not enough close-parentheses at the end of the tree. - throw new IllegalArgumentException(); - } - } - - /** - * Return the number of nodes in this Tree. - */ - public int size() { - return labels.length; - } - - /** - * Get the root Node of this Tree. 
- */ - public Node root() { - return new Node(0); - } - - private List childIndices(int index) { - List result = new ArrayList(); - int remainingChildren = numChildren[index]; - int childIndex = index + 1; - while (remainingChildren > 0) { - result.add(childIndex); - childIndex = nextSiblingIndex(childIndex); - remainingChildren--; - } - return result; - } - - private int nextSiblingIndex(int index) { - int result = index + 1; - int remainingChildren = numChildren[index]; - for (int i = 0; i < remainingChildren; i++) { - result = nextSiblingIndex(result); - } - return result; - } - - public String yield() { - String result = ""; - for (int i = 0; i < labels.length; i++) { - if (numChildren[i] == 0) { - if (!result.equals("")) { - result += " "; - } - result += labels[i]; - } - } - return result; - } - - @Override - public String toString() { - return root().toString(); - } - - /** - * A class representing the Nodes of a tree. - */ - public class Node { - - /** - * The index into the Tree class's internal arrays. - */ - private final int index; - - private Node(int i) { - index = i; - } - - /** - * Get the label for this node. If the node is internal to the tree, its - * label is the non-terminal label assigned to it. If it is a leaf node, - * the label is the English word at the leaf. - */ - public String label() { - return labels[index]; - } - - public boolean isLeaf() { - return numChildren[index] == 0; - } - - public int sourceStartIndex() { - return sourceStartIndices[index]; - } - - public int sourceEndIndex() { - return sourceEndIndices[index]; - } - - public List children() { - List result = new ArrayList(); - for (int j : childIndices(index)) { - result.add(new Node(j)); - } - return result; - } - - @Override - public String toString() { - if (isLeaf()) { - return label(); - } - String result = String.format("(%s{%d-%d}", - label(), - sourceStartIndex(), - sourceEndIndex()); - for (Node c : children()) { - result += String.format(" %s", c); - } - return result + ")"; - } - } - - public static class NodeSourceStartComparator implements Comparator { - public int compare(Node a, Node b) { - return a.sourceStartIndex() - b.sourceStartIndex(); - } - } -} diff --git a/src/joshua/util/io/package.html b/src/joshua/util/io/package.html deleted file mode 100644 index dd4c7522..00000000 --- a/src/joshua/util/io/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides common utility classes for IO. - - - - - - diff --git a/src/joshua/util/package.html b/src/joshua/util/package.html deleted file mode 100644 index c24e2357..00000000 --- a/src/joshua/util/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides common utility classes. - - - - - - diff --git a/src/joshua/zmert/package.html b/src/joshua/zmert/package.html deleted file mode 100644 index e3a0b2d6..00000000 --- a/src/joshua/zmert/package.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - -Provides code for performing minimum error rate training. - - -
diff --git a/src/joshua/util/io/package.html b/src/joshua/util/io/package.html deleted file mode 100644 index dd4c7522..00000000 --- a/src/joshua/util/io/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides common utility classes for IO. - - - - - - diff --git a/src/joshua/util/package.html b/src/joshua/util/package.html deleted file mode 100644 index c24e2357..00000000 --- a/src/joshua/util/package.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - -Provides common utility classes. - - - - - - diff --git a/src/joshua/zmert/package.html b/src/joshua/zmert/package.html deleted file mode 100644 index e3a0b2d6..00000000 --- a/src/joshua/zmert/package.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - -Provides code for performing minimum error rate training. - -
-<h2>Related Documentation</h2>
-<ul>
-  <li>Much of the code in this package is based on Och (2003).</li>
-  <li>A deeper description of the algorithm is in Zaidan (2009).</li>
-</ul>
- - - - - diff --git a/src/joshua/adagrad/AdaGrad.java b/src/main/java/org/apache/joshua/adagrad/AdaGrad.java similarity index 98% rename from src/joshua/adagrad/AdaGrad.java rename to src/main/java/org/apache/joshua/adagrad/AdaGrad.java index 61e90ad7..ac110858 100755 --- a/src/joshua/adagrad/AdaGrad.java +++ b/src/main/java/org/apache/joshua/adagrad/AdaGrad.java @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.adagrad; +package org.apache.joshua.adagrad; -import joshua.decoder.JoshuaConfiguration; -import joshua.util.FileUtility; -import joshua.util.StreamGobbler; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.util.FileUtility; +import org.apache.joshua.util.StreamGobbler; public class AdaGrad { public static void main(String[] args) throws Exception { diff --git a/src/joshua/adagrad/AdaGradCore.java b/src/main/java/org/apache/joshua/adagrad/AdaGradCore.java similarity index 92% rename from src/joshua/adagrad/AdaGradCore.java rename to src/main/java/org/apache/joshua/adagrad/AdaGradCore.java index e2958c6b..789757fd 100755 --- a/src/joshua/adagrad/AdaGradCore.java +++ b/src/main/java/org/apache/joshua/adagrad/AdaGradCore.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.adagrad; +package org.apache.joshua.adagrad; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -40,30 +40,35 @@ import java.util.Scanner; import java.util.TreeSet; import java.util.Vector; -import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.metrics.EvaluationMetric; -import joshua.util.StreamGobbler; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.metrics.EvaluationMetric; +import org.apache.joshua.util.StreamGobbler; + +import EDU.oswego.cs.dl.util.concurrent.ConcurrentHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This code was originally written by Yuan Cao, who copied the MERT code to produce this file. 
*/ public class AdaGradCore { - private final JoshuaConfiguration joshuaConfiguration; - private TreeSet<Integer>[] indicesOfInterest_all; - - private final static DecimalFormat f4 = new DecimalFormat("###0.0000"); - private final Runtime myRuntime = Runtime.getRuntime(); + private static final Logger LOG = LoggerFactory.getLogger(AdaGradCore.class); private final static double NegInf = (-1.0 / 0.0); private final static double PosInf = (+1.0 / 0.0); private final static double epsilon = 1.0 / 1000000; + private final static DecimalFormat f4 = new DecimalFormat("###0.0000"); + + private final JoshuaConfiguration joshuaConfiguration; + private final Runtime myRuntime = Runtime.getRuntime(); + + private TreeSet<Integer>[] indicesOfInterest_all; private int progress; @@ -328,12 +333,8 @@ private void initialize(int randsToSkip) { } inFile_names.close(); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } // the parameter file contains one line per parameter @@ -391,12 +392,8 @@ private void initialize(int randsToSkip) { inFile_comm.close(); } } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } // set static data members for the EvaluationMetric class @@ -717,12 +714,12 @@ public double[] run_single_iteration(int iteration, int minIts, int maxIts, int int[] candCount = new int[numSentences]; int[] lastUsedIndex = new int[numSentences]; - ConcurrentHashMap<Integer, int[]>[] suffStats_array = new ConcurrentHashMap[numSentences]; + ConcurrentHashMap[] suffStats_array = new ConcurrentHashMap[numSentences]; for (int i = 0; i < numSentences; ++i) { candCount[i] = 0; lastUsedIndex[i] = -1; // suffStats_array[i].clear(); - suffStats_array[i] = new ConcurrentHashMap<Integer, int[]>(); + suffStats_array[i] = new ConcurrentHashMap(); } // initLambda[0] is not used!
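The error-handling hunks in this file all follow one mechanical pattern: catch blocks that printed to System.err and called System.exit with a per-site code (99901, 99902, ...) are collapsed into a single catch that wraps the cause in an unchecked exception. A minimal sketch of the resulting shape (class, method, and message invented for illustration):

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;

    // After the refactor: FileNotFoundException folds into IOException,
    // and the cause propagates to the caller instead of killing the JVM.
    class OpenOrThrow {
      static BufferedReader open(String fileName) {
        try {
          return new BufferedReader(new FileReader(fileName));
        } catch (IOException e) {
          // was: System.err.println(...); System.exit(99902);
          throw new RuntimeException(e);
        }
      }
    }

Because FileNotFoundException is a subclass of IOException, the separate 99901 branch becomes redundant once distinct exit codes are no longer needed, and failures become observable to callers and to tests.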
@@ -1220,13 +1217,8 @@ public double[] run_single_iteration(int iteration, int minIts, int maxIts, int println("Number of features observed so far: " + numParams); println("", 1); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.run_single_iteration(6): " - + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.run_single_iteration(6): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } // n-best list converges @@ -1500,17 +1492,11 @@ && fileExists(fakeFileNamePrefix + iteration + fakeFileNameSuffix)) { int decStatus = p.waitFor(); if (decStatus != validDecoderExitValue) { - println("Call to decoder returned " + decStatus + "; was expecting " + throw new RuntimeException("Call to decoder returned " + decStatus + "; was expecting " + validDecoderExitValue + "."); - System.exit(30); } - } catch (IOException e) { - System.err.println("IOException in AdaGradCore.run_decoder(int): " + e.getMessage()); - System.exit(99902); - } catch (InterruptedException e) { - System.err.println("InterruptedException in AdaGradCore.run_decoder(int): " - + e.getMessage()); - System.exit(99903); + } catch (IOException | InterruptedException e) { + throw new RuntimeException(e); } retSA[0] = decoderOutFileName; @@ -1608,13 +1594,8 @@ private void produceTempFiles(String nbestFileName, int iteration) { gzipFile(featsFileName); } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.produceTempFiles(int): " - + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.produceTempFiles(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1662,9 +1643,7 @@ private void createConfigFile(ArrayList params, String cfgFileName, inFile.close(); outFile.close(); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.createConfigFile(double[],String,String): " - + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1674,9 +1653,7 @@ private void processParamFile() { try { inFile_init = new Scanner(new FileReader(paramsFileName)); } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.processParamFile(): " - + e.getMessage()); - System.exit(99901); + throw new RuntimeException(e); } String dummy = ""; @@ -1699,8 +1676,7 @@ private void processParamFile() { } else if (dummy.equals("Fix")) { isOptimizable[c] = false; } else { - println("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)"); - System.exit(21); + throw new RuntimeException("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)"); } if (!isOptimizable[c]) { // skip next two values @@ -1715,25 +1691,22 @@ private void processParamFile() { // set minRandValue[c] and maxRandValue[c] (range for random values) dummy = inFile_init.next(); if (dummy.equals("-Inf") || dummy.equals("+Inf")) { - println("minRandValue[" + c + "] cannot be -Inf or +Inf!"); - System.exit(21); + throw new RuntimeException("minRandValue[" + c + "] cannot be -Inf or +Inf!"); } else { minRandValue[c] = Double.parseDouble(dummy); } dummy = inFile_init.next(); if (dummy.equals("-Inf") || dummy.equals("+Inf")) { - println("maxRandValue[" + c + "] cannot be -Inf or +Inf!"); - System.exit(21); + throw new RuntimeException("maxRandValue[" + c + "] cannot be -Inf or +Inf!"); } else { maxRandValue[c] = 
Double.parseDouble(dummy); } // check for illogical values if (minRandValue[c] > maxRandValue[c]) { - println("minRandValue[" + c + "]=" + minRandValue[c] + " > " + maxRandValue[c] + throw new RuntimeException("minRandValue[" + c + "]=" + minRandValue[c] + " > " + maxRandValue[c] + "=maxRandValue[" + c + "]!"); - System.exit(21); } // check for odd values @@ -1784,40 +1757,34 @@ private void processParamFile() { normalizationOptions[2] = Vocabulary.id(pName); if (normalizationOptions[1] <= 0) { - println("Value for the absval normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the absval normalization method must be positive."); } if (normalizationOptions[2] == 0) { - println("Unrecognized feature name " + normalizationOptions[2] - + " for absval normalization method.", 1); - System.exit(21); + throw new RuntimeException("Unrecognized feature name " + normalizationOptions[2] + + " for absval normalization method."); } } else if (dummyA[0].equals("maxabsval")) { normalizationOptions[0] = 2; normalizationOptions[1] = Double.parseDouble(dummyA[1]); if (normalizationOptions[1] <= 0) { - println("Value for the maxabsval normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the maxabsval normalization method must be positive."); } } else if (dummyA[0].equals("minabsval")) { normalizationOptions[0] = 3; normalizationOptions[1] = Double.parseDouble(dummyA[1]); if (normalizationOptions[1] <= 0) { - println("Value for the minabsval normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the minabsval normalization method must be positive."); } } else if (dummyA[0].equals("LNorm")) { normalizationOptions[0] = 4; normalizationOptions[1] = Double.parseDouble(dummyA[1]); normalizationOptions[2] = Double.parseDouble(dummyA[2]); if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) { - println("Both values for the LNorm normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Both values for the LNorm normalization method must be positive."); } } else { - println("Unrecognized normalization method " + dummyA[0] + "; " + throw new RuntimeException("Unrecognized normalization method " + dummyA[0] + "; " + "must be one of none, absval, maxabsval, and LNorm."); - System.exit(21); } // if (dummyA[0]) inFile_init.close(); @@ -1929,13 +1896,8 @@ private void processDocInfo() { } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.processDocInfo(): " - + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.processDocInfo(): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1970,12 +1932,8 @@ private boolean copyFile(String origFileName, String newFileName) { * inFile.close(); outFile.close(); */ return true; - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in AdaGradCore.copyFile(String,String): " - + e.getMessage()); - return false; } catch (IOException e) { - System.err.println("IOException in AdaGradCore.copyFile(String,String): " + e.getMessage()); + LOG.error(e.getMessage(), e); return false; } } @@ -2036,8 +1994,7 @@ public void finish() { outFile_lambdas.close(); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.finish(): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -2118,9 +2075,8 @@ else 
if (arg.length() > 0) { argsVector.add(paramA[opt]); } } else { - println("Malformed line in config file:"); - println(origLine); - System.exit(70); + String msg = "Malformed line in config file:" + origLine; + throw new RuntimeException(msg); } } @@ -2129,13 +2085,9 @@ else if (arg.length() > 0) { inFile.close(); } catch (FileNotFoundException e) { println("AdaGrad configuration file " + fileName + " was not found!"); - System.err.println("FileNotFoundException in AdaGradCore.cfgFileToArgsArray(String): " - + e.getMessage()); - System.exit(99901); + throw new RuntimeException(e); } catch (IOException e) { - System.err - .println("IOException in AdaGradCore.cfgFileToArgsArray(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } String[] argsArray = new String[argsVector.size()]; @@ -2213,14 +2165,12 @@ private void processArgsArray(String[] args, boolean firstTime) { } else if (option.equals("-rps")) { refsPerSen = Integer.parseInt(args[i + 1]); if (refsPerSen < 1) { - println("refsPerSen must be positive."); - System.exit(10); + throw new RuntimeException("refsPerSen must be positive."); } } else if (option.equals("-txtNrm")) { textNormMethod = Integer.parseInt(args[i + 1]); if (textNormMethod < 0 || textNormMethod > 4) { - println("textNormMethod should be between 0 and 4"); - System.exit(10); + throw new RuntimeException("textNormMethod should be between 0 and 4"); } } else if (option.equals("-p")) { paramsFileName = args[i + 1]; @@ -2240,8 +2190,7 @@ private void processArgsArray(String[] args, boolean firstTime) { } i += optionCount; } else { - println("Unknown metric name " + metricName + "."); - System.exit(10); + throw new RuntimeException("Unknown metric name " + metricName + "."); } } else if (option.equals("-docSet")) { String method = args[i + 1]; @@ -2286,32 +2235,27 @@ private void processArgsArray(String[] args, boolean firstTime) { docSubsetInfo[6] = Integer.parseInt(a2); i += 3; } else { - println("Unknown docSet method " + method + "."); - System.exit(10); + throw new RuntimeException("Unknown docSet method " + method + "."); } } else if (option.equals("-maxIt")) { maxMERTIterations = Integer.parseInt(args[i + 1]); if (maxMERTIterations < 1) { - println("maxIt must be positive."); - System.exit(10); + throw new RuntimeException("maxIt must be positive."); } } else if (option.equals("-minIt")) { minMERTIterations = Integer.parseInt(args[i + 1]); if (minMERTIterations < 1) { - println("minIt must be positive."); - System.exit(10); + throw new RuntimeException("minIt must be positive."); } } else if (option.equals("-prevIt")) { prevMERTIterations = Integer.parseInt(args[i + 1]); if (prevMERTIterations < 0) { - println("prevIt must be non-negative."); - System.exit(10); + throw new RuntimeException("prevIt must be non-negative."); } } else if (option.equals("-stopIt")) { stopMinIts = Integer.parseInt(args[i + 1]); if (stopMinIts < 1) { - println("stopIts must be positive."); - System.exit(10); + throw new RuntimeException("stopIts must be positive."); } } else if (option.equals("-stopSig")) { stopSigValue = Double.parseDouble(args[i + 1]); @@ -2322,20 +2266,17 @@ private void processArgsArray(String[] args, boolean firstTime) { else if (option.equals("-thrCnt")) { numOptThreads = Integer.parseInt(args[i + 1]); if (numOptThreads < 1) { - println("threadCount must be positive."); - System.exit(10); + throw new RuntimeException("threadCount must be positive."); } } else if (option.equals("-save")) { saveInterFiles = Integer.parseInt(args[i + 
1]); if (saveInterFiles < 0 || saveInterFiles > 3) { - println("save should be between 0 and 3"); - System.exit(10); + throw new RuntimeException("save should be between 0 and 3"); } } else if (option.equals("-compress")) { compressFiles = Integer.parseInt(args[i + 1]); if (compressFiles < 0 || compressFiles > 1) { - println("compressFiles should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("compressFiles should be either 0 or 1"); } } else if (option.equals("-opi")) { int opi = Integer.parseInt(args[i + 1]); @@ -2344,8 +2285,7 @@ else if (option.equals("-thrCnt")) { } else if (opi == 0) { oneModificationPerIteration = false; } else { - println("oncePerIt must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("oncePerIt must be either 0 or 1."); } } else if (option.equals("-rand")) { int rand = Integer.parseInt(args[i + 1]); @@ -2354,8 +2294,7 @@ else if (option.equals("-thrCnt")) { } else if (rand == 0) { randInit = false; } else { - println("randInit must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("randInit must be either 0 or 1."); } } else if (option.equals("-seed")) { if (args[i + 1].equals("time")) { @@ -2377,8 +2316,7 @@ else if (option.equals("-needShuffle")) { else if (shuffle == 0) needShuffle = false; else { - println("-needShuffle must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("-needShuffle must be either 0 or 1."); } } // average weights after each epoch or not @@ -2389,8 +2327,7 @@ else if (option.equals("-needAvg")) { else if (avg == 0) needAvg = false; else { - println("-needAvg must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("-needAvg must be either 0 or 1."); } } // return the best weight during tuning or not @@ -2401,8 +2338,7 @@ else if (option.equals("-returnBest")) { else if (retBest == 0) returnBest = false; else { - println("-returnBest must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("-returnBest must be either 0 or 1."); } } // mini-batch size @@ -2445,8 +2381,7 @@ else if (option.equals("-adagradIter")) { else if (option.equals("-scoreRatio")) { scoreRatio = Double.parseDouble(args[i + 1]); if (scoreRatio <= 0) { - println("-scoreRatio must be positive"); - System.exit(10); + throw new RuntimeException("-scoreRatio must be positive"); } } else if (option.equals("-needScaling")) { int scale = Integer.parseInt(args[i + 1]); @@ -2455,8 +2390,7 @@ else if (option.equals("-scoreRatio")) { else if (scale == 0) needScale = false; else { - println("-needScaling must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("-needScaling must be either 0 or 1."); } } else if (option.equals("-usePseudoCorpus")) { int use = Integer.parseInt(args[i + 1]); @@ -2465,8 +2399,7 @@ else if (scale == 0) else if (use == 0) usePseudoBleu = false; else { - println("-usePseudoCorpus must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("-usePseudoCorpus must be either 0 or 1."); } } else if (option.equals("-corpusDecay")) { R = Double.parseDouble(args[i + 1]); @@ -2478,8 +2411,7 @@ else if (option.equals("-cmd")) { } else if (option.equals("-passIt")) { int val = Integer.parseInt(args[i + 1]); if (val < 0 || val > 1) { - println("passIterationToDecoder should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("passIterationToDecoder should be either 0 or 1"); } passIterationToDecoder = (val == 1) ? 
true : false; } else if (option.equals("-decOut")) { @@ -2491,35 +2423,30 @@ else if (option.equals("-cmd")) { } else if (option.equals("-N")) { sizeOfNBest = Integer.parseInt(args[i + 1]); if (sizeOfNBest < 1) { - println("N must be positive."); - System.exit(10); + throw new RuntimeException("N must be positive."); } } // Output specs else if (option.equals("-v")) { verbosity = Integer.parseInt(args[i + 1]); if (verbosity < 0 || verbosity > 4) { - println("verbosity should be between 0 and 4"); - System.exit(10); + throw new RuntimeException("verbosity should be between 0 and 4"); } } else if (option.equals("-decV")) { decVerbosity = Integer.parseInt(args[i + 1]); if (decVerbosity < 0 || decVerbosity > 1) { - println("decVerbosity should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("decVerbosity should be either 0 or 1"); } } else if (option.equals("-fake")) { fakeFileNameTemplate = args[i + 1]; int QM_i = fakeFileNameTemplate.indexOf("?"); if (QM_i <= 0) { - println("fakeFileNameTemplate must contain '?' to indicate position of iteration number"); - System.exit(10); + throw new RuntimeException("fakeFileNameTemplate must contain '?' to indicate position of iteration number"); } fakeFileNamePrefix = fakeFileNameTemplate.substring(0, QM_i); fakeFileNameSuffix = fakeFileNameTemplate.substring(QM_i + 1); } else { - println("Unknown option " + option); - System.exit(10); + throw new RuntimeException("Unknown option " + option); } i += 2; @@ -2591,10 +2518,11 @@ else if (option.equals("-v")) { if (!canRunCommand && !canRunJoshua) { // can only run fake decoder if (!canRunFake) { - println("AdaGrad cannot decode; must provide one of: command file (for external decoder),"); - println(" source file (for Joshua decoder),"); - println(" or prefix for existing output files (for fake decoder)."); - System.exit(12); + String msg = "AdaGrad cannot decode; must provide one of:" + + " command file (for external decoder)," + + " source file (for Joshua decoder)," + + " or prefix for existing output files (for fake decoder)."; + throw new RuntimeException(msg); } int lastGoodIt = 0; @@ -2607,9 +2535,8 @@ else if (option.equals("-v")) { } if (lastGoodIt == 0) { - println("Fake decoder cannot find first output file " + throw new RuntimeException("Fake decoder cannot find first output file " + (fakeFileNamePrefix + 1 + fakeFileNameSuffix)); - System.exit(13); } else if (lastGoodIt < maxMERTIterations) { if (firstTime) println("Warning: can only run fake decoder; existing output files " @@ -2701,8 +2628,7 @@ private void set_docSubsetInfo(int[] info) { private void checkFile(String fileName) { if (!fileExists(fileName)) { - println("The file " + fileName + " was not found!"); - System.exit(40); + throw new RuntimeException("The file " + fileName + " was not found!"); } } @@ -2737,8 +2663,7 @@ private void gzipFile(String inputFileName, String gzippedFileName) { deleteFile(inputFileName); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.gzipFile(String,String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -2770,8 +2695,7 @@ private void gunzipFile(String gzippedFileName, String outputFileName) { deleteFile(gzippedFileName); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.gunzipFile(String,String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -2787,8 +2711,8 @@ private String createUnifiedRefFile(String prefix, int numFiles) { if (!checker.exists()) { checker = 
new File(prefix + ".1"); if (!checker.exists()) { - println("Can't find reference files."); - System.exit(50); + String msg = "Can't find reference files."; + throw new RuntimeException(msg); } else { prefix = prefix + "."; } @@ -2817,8 +2741,8 @@ private String createUnifiedRefFile(String prefix, int numFiles) { for (int r = 0; r < numFiles; ++r) { if (countLines(prefix + nextIndex) != lineCount) { - println("Line count mismatch in " + (prefix + nextIndex) + "."); - System.exit(60); + String msg = "Line count mismatch in " + (prefix + nextIndex) + "."; + throw new RuntimeException(msg); } InputStream inStream = new FileInputStream(new File(prefix + nextIndex)); inFile[r] = new BufferedReader(new InputStreamReader(inStream, "utf8")); @@ -2839,15 +2763,8 @@ private String createUnifiedRefFile(String prefix, int numFiles) { for (int r = 0; r < numFiles; ++r) { inFile[r].close(); } - } catch (FileNotFoundException e) { - System.err - .println("FileNotFoundException in AdaGradCore.createUnifiedRefFile(String,int): " - + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.createUnifiedRefFile(String,int): " - + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return outFileName; @@ -3000,8 +2917,7 @@ private int countLines(String fileName) { inFile.close(); } catch (IOException e) { - System.err.println("IOException in AdaGradCore.countLines(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return count; @@ -3022,9 +2938,7 @@ private int countNonEmptyLines(String fileName) { inFile.close(); } catch (IOException e) { - System.err - .println("IOException in AdaGradCore.countNonEmptyLines(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return count; @@ -3156,8 +3070,8 @@ private double[] randomPerturbation(double[] origLambda, int i, double method, d randVal = randVal * sigma; // number in [-sigma,sigma] randVal = randVal * origLambda[c]; // number in [-sigma*orig[c],sigma*orig[c]] randVal = randVal + origLambda[c]; // number in - // [orig[c]-sigma*orig[c],orig[c]+sigma*orig[c]] - // = [orig[c]*(1-sigma),orig[c]*(1+sigma)] + // [orig[c]-sigma*orig[c],orig[c]+sigma*orig[c]] + // = [orig[c]*(1-sigma),orig[c]*(1+sigma)] retLambda[c] = randVal; } else { retLambda[c] = origLambda[c]; diff --git a/src/joshua/adagrad/Optimizer.java b/src/main/java/org/apache/joshua/adagrad/Optimizer.java similarity index 99% rename from src/joshua/adagrad/Optimizer.java rename to src/main/java/org/apache/joshua/adagrad/Optimizer.java index 496277fb..722c593f 100755 --- a/src/joshua/adagrad/Optimizer.java +++ b/src/main/java/org/apache/joshua/adagrad/Optimizer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.adagrad; +package org.apache.joshua.adagrad; import java.util.Collections; import java.util.ArrayList; @@ -27,8 +27,8 @@ import java.util.Vector; import java.lang.Math; -import joshua.corpus.Vocabulary; -import joshua.metrics.EvaluationMetric; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.metrics.EvaluationMetric; // this class implements the AdaGrad algorithm public class Optimizer { diff --git a/src/joshua/corpus/AbstractPhrase.java b/src/main/java/org/apache/joshua/corpus/AbstractPhrase.java similarity index 99% rename from src/joshua/corpus/AbstractPhrase.java rename to src/main/java/org/apache/joshua/corpus/AbstractPhrase.java index 5f900049..b4637d4b 100644 --- a/src/joshua/corpus/AbstractPhrase.java +++ b/src/main/java/org/apache/joshua/corpus/AbstractPhrase.java @@ -16,9 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; - - +package org.apache.joshua.corpus; /** * This class provides a skeletal implementation of the base methods likely to be common to most or diff --git a/src/joshua/corpus/BasicPhrase.java b/src/main/java/org/apache/joshua/corpus/BasicPhrase.java similarity index 96% rename from src/joshua/corpus/BasicPhrase.java rename to src/main/java/org/apache/joshua/corpus/BasicPhrase.java index ef2f0576..f7f6be28 100644 --- a/src/joshua/corpus/BasicPhrase.java +++ b/src/main/java/org/apache/joshua/corpus/BasicPhrase.java @@ -5,14 +5,14 @@ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible * with Apache License 2.0 */ -package joshua.corpus; +package org.apache.joshua.corpus; import java.util.ArrayList; /** * The simplest concrete implementation of Phrase. * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate$ */ public class BasicPhrase extends AbstractPhrase { diff --git a/src/joshua/corpus/ContiguousPhrase.java b/src/main/java/org/apache/joshua/corpus/ContiguousPhrase.java similarity index 67% rename from src/joshua/corpus/ContiguousPhrase.java rename to src/main/java/org/apache/joshua/corpus/ContiguousPhrase.java index 25395777..af669b7c 100644 --- a/src/joshua/corpus/ContiguousPhrase.java +++ b/src/main/java/org/apache/joshua/corpus/ContiguousPhrase.java @@ -16,13 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; - +package org.apache.joshua.corpus; import java.util.ArrayList; import java.util.List; - /** * ContiguousPhrase implements the Phrase interface by linking into indices within a corpus. This is * intended to be a very low-memory implementation of the class. 
@@ -33,37 +31,16 @@ */ public class ContiguousPhrase extends AbstractPhrase { - // =============================================================== - // Constants - // =============================================================== - - // =============================================================== - // Member variables - // =============================================================== - protected int startIndex; protected int endIndex; protected Corpus corpusArray; - // =============================================================== - // Constructor(s) - // =============================================================== - public ContiguousPhrase(int startIndex, int endIndex, Corpus corpusArray) { this.startIndex = startIndex; this.endIndex = endIndex; this.corpusArray = corpusArray; } - - // =============================================================== - // Public - // =============================================================== - - // =========================================================== - // Accessor methods (set/get) - // =========================================================== - /** * This method copies the phrase into an array of ints. This method should be avoided if possible. * @@ -77,23 +54,15 @@ public int[] getWordIDs() { return words; } - public int getWordID(int position) { return corpusArray.getWordID(startIndex + position); // return corpusArray.corpus[startIndex+position]; } - public int size() { return endIndex - startIndex; } - - // =========================================================== - // Methods - // =========================================================== - - /** * Gets all possible subphrases of this phrase, up to and including the phrase itself. For * example, the phrase "I like cheese ." would return the following: @@ -116,7 +85,6 @@ public List getSubPhrases() { return getSubPhrases(size()); } - /** * Returns a list of subphrases only of length maxLength or smaller. * @@ -136,7 +104,6 @@ public List getSubPhrases(int maxLength) { return phrases; } - /** * creates a new phrase object from the indexes provided. *

@@ -150,36 +117,9 @@ public Phrase subPhrase(int start, int end) { return new ContiguousPhrase(startIndex + start, startIndex + end, corpusArray); } - - // =============================================================== - // Protected - // =============================================================== - - // =============================================================== - // Methods - // =============================================================== - - - // =============================================================== - // Private - // =============================================================== - - // =============================================================== - // Methods - // =============================================================== - - - // =============================================================== - // Static - // =============================================================== - - - // =============================================================== - // Main - // =============================================================== - /** * Main contains test code + * @param args String array of arguments used to run this class. */ public static void main(String[] args) { diff --git a/src/joshua/corpus/Corpus.java b/src/main/java/org/apache/joshua/corpus/Corpus.java similarity index 84% rename from src/joshua/corpus/Corpus.java rename to src/main/java/org/apache/joshua/corpus/Corpus.java index d3a394c4..1a7d1b0d 100755 --- a/src/joshua/corpus/Corpus.java +++ b/src/main/java/org/apache/joshua/corpus/Corpus.java @@ -16,9 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; - - +package org.apache.joshua.corpus; /** * Corpus is an interface that contains methods for accessing the information within a monolingual @@ -36,6 +34,7 @@ public interface Corpus { // extends Externalizable { // =============================================================== /** + * @param position the position at which we want to obtain a word ID * @return the integer representation of the Word at the specified position in the corpus. */ int getWordID(int position); @@ -53,7 +52,7 @@ public interface Corpus { // extends Externalizable { /** * Gets the sentence index of each specified position. * - * @param position Index into the corpus + * @param positions Index into the corpus * @return array of the sentence indices associated with the specified positions in the corpus. */ int[] getSentenceIndices(int[] positions); @@ -62,6 +61,7 @@ public interface Corpus { // extends Externalizable { * Gets the position in the corpus of the first word of the specified sentence. If the sentenceID * is outside of the bounds of the sentences, then it returns the last position in the corpus + 1. * + * @param sentenceID a specific sentence to obtain a position for * @return the position in the corpus of the first word of the specified sentence. If the * sentenceID is outside of the bounds of the sentences, then it returns the last position * in the corpus + 1. @@ -71,6 +71,7 @@ public interface Corpus { // extends Externalizable { /** * Gets the exclusive end position of a sentence in the corpus. * + * @param sentenceID a specific sentence to obtain an end position for * @return the position in the corpus one past the last word of the specified sentence. If the * sentenceID is outside of the bounds of the sentences, then it returns one past the last * position in the corpus. 
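The getSubPhrases() documentation above enumerates every contiguous span of "I like cheese ." up to a maximum length. A standalone illustration over a plain String[] (class and helper names invented; the corpus-backed implementation works over word IDs and its enumeration order may differ):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // All contiguous spans of at most maxLength words, as described in the
    // getSubPhrases() javadoc; spans are joined back into strings here.
    class SubPhraseDemo {
      static List<String> subPhrases(String[] words, int maxLength) {
        List<String> result = new ArrayList<>();
        for (int start = 0; start < words.length; start++) {
          for (int end = start + 1; end <= Math.min(words.length, start + maxLength); end++) {
            result.add(String.join(" ", Arrays.copyOfRange(words, start, end)));
          }
        }
        return result;
      }

      public static void main(String[] args) {
        // Prints "I", "I like", ..., "like", ..., "I like cheese ."
        System.out.println(subPhrases("I like cheese .".split(" "), 4));
      }
    }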
@@ -115,7 +116,7 @@ public interface Corpus { // extends Externalizable { * @param phrase the superphrase that the comparison phrase is drawn from * @param phraseStart the point in the phrase where the comparison begins (inclusive) * @param phraseEnd the point in the phrase where the comparison ends (exclusive) - * @return an int that follows the conventions of java.util.Comparator.compareTo() + * @return an int that follows the conventions of {@link java.util.Comparator#compare(Object, Object)} */ int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd); @@ -124,9 +125,9 @@ public interface Corpus { // extends Externalizable { * Compares the phrase that starts at position start with the phrase passed in. Compares the * entire phrase. * - * @param corpusStart - * @param phrase - * @return + * @param corpusStart position start + * @param phrase {@link org.apache.joshua.corpus.Phrase} to compare against + * @return an int that follows the conventions of {@link java.util.Comparator#compare(Object, Object)} */ int comparePhrase(int corpusStart, Phrase phrase); @@ -136,15 +137,15 @@ public interface Corpus { // extends Externalizable { * * @param position1 the position in the corpus where the first suffix begins * @param position2 the position in the corpus where the second suffix begins * @param maxComparisonLength a cutoff point to stop the comparison - * @return an int that follows the conventions of java.util.Comparator.compareTo() + * @return an int that follows the conventions of {@link java.util.Comparator#compare(Object, Object)} */ int compareSuffixes(int position1, int position2, int maxComparisonLength); /** * - * @param startPosition - * @param endPosition - * @return + * @param startPosition start position for phrase + * @param endPosition end position for phrase + * @return the {@link org.apache.joshua.corpus.ContiguousPhrase} */ ContiguousPhrase getPhrase(int startPosition, int endPosition); diff --git a/src/joshua/corpus/Phrase.java b/src/main/java/org/apache/joshua/corpus/Phrase.java similarity index 93% rename from src/joshua/corpus/Phrase.java rename to src/main/java/org/apache/joshua/corpus/Phrase.java index ba46220c..5a06a8b8 100644 --- a/src/joshua/corpus/Phrase.java +++ b/src/main/java/org/apache/joshua/corpus/Phrase.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; +package org.apache.joshua.corpus; import java.util.ArrayList; import java.util.List; @@ -93,6 +93,9 @@ public interface Phrase extends Comparable<Phrase> { * complete Phrase List. * * @see ArrayList#subList(int, int) + * @param start start position to begin new phrase + * @param end end position to end new phrase + * @return a new {@link org.apache.joshua.corpus.Phrase} object from the indexes provided. */ Phrase subPhrase(int start, int end); diff --git a/src/joshua/corpus/Span.java b/src/main/java/org/apache/joshua/corpus/Span.java similarity index 95% rename from src/joshua/corpus/Span.java rename to src/main/java/org/apache/joshua/corpus/Span.java index a51a9d23..414fe957 100644 --- a/src/joshua/corpus/Span.java +++ b/src/main/java/org/apache/joshua/corpus/Span.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; +package org.apache.joshua.corpus; import java.util.ArrayList; import java.util.Iterator; @@ -90,8 +90,8 @@ public boolean strictlyContainedIn(Span o) { /** * Returns true if the other span does not intersect with this one.
- * @param o - * @return + * @param o new {@link org.apache.joshua.corpus.Span} to check for intersection + * @return true if the other span does not intersect with this one */ public boolean disjointFrom(Span o) { if (start < o.start) { diff --git a/src/main/java/org/apache/joshua/corpus/SymbolTable.java b/src/main/java/org/apache/joshua/corpus/SymbolTable.java new file mode 100644 index 00000000..274e8b9b --- /dev/null +++ b/src/main/java/org/apache/joshua/corpus/SymbolTable.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.corpus; + +import java.util.Collection; + +/** + * Represents a symbol table capable of mapping between strings and + * symbols. + * + * @author Lane Schwartz + * @author Zhifei Li + * @version $LastChangedDate: 2009-11-24 23:07:43 -0600 (Tue, 24 Nov 2009) $ + */ +public interface SymbolTable { + + //TODO Remove all hard-coded references to nonterminals + + /** + * The unknown word's ID will be the size of the vocabulary, + * ensuring that it is outside of the vocabulary. Note that + * for vocabularies which have not been fixed yet, this + * means the actual value is volatile and therefore a word + * ID can only be compared against UNKNOWN_WORD at the time + * the word ID is generated (otherwise unknown words can + * become "known" if new words are added to the vocabulary + * before testing). + *

+ * Negative IDs are reserved for non-terminals. + * + * Zero is reserved as the UNKNOWN_WORD. + */ + int UNKNOWN_WORD = 1; + + /** String representation for out-of-vocabulary words. */ + String UNKNOWN_WORD_STRING = ""; + + /** + * Integer representation of the bare (non-indexed) nonterminal X, + * which represents a wild-card gap in a phrase. + *

+ * All nonterminals are guaranteed to be represented by negative integers. + */ + int X = -1; + + /** + * String representation of the bare (non-indexed) nonterminal X, + * which represents a wild-card gap in a phrase. + */ + String X_STRING = "[X]"; + + + + /** + * String representation of the nonterminal X with index 1, + * which represents a wild-card gap in a phrase. + */ + String X1_STRING = "[X,1]"; + + + + /** + * String representation of the nonterminal X with index 2, + * which represents a wild-card gap in a phrase. + */ + String X2_STRING = "[X,2]"; + + /** + * Integer representation of the nonterminal S. + *

+ * All nonterminals are guaranteed to be represented by negative integers. + */ + int S = -4; + + /** + * String representation of the nonterminal S. + */ + String S_STRING = "[S]"; + + /** + * Integer representation of the nonterminal S with index 1. + *

+ * All nonterminals are guaranteed to be represented by negative integers. + */ + int S1 = -5; + + /** + * String representation of the nonterminal S with index 1. + */ + String S1_STRING = "[S,1]"; + + /** + * Gets a unique integer identifier for the nonterminal. + *

+ * The integer returned is guaranteed to be a negative number. + * + * If the nonterminal is {@link #X_STRING}, + * then the value returned must be {@link #X}. + * + * Otherwise, the value returned must be a negative number + * whose value is less than {@link X}. + * + * @param nonterminal Nonterminal symbol + * @return a unique integer identifier for the nonterminal + */ + int addNonterminal(String nonterminal); + + /** + * Gets a unique integer identifier for the terminal. + * + * @param terminal Terminal symbol + * @return a unique integer identifier for the terminal + */ + int addTerminal(String terminal); + + /** + * Gets the unique integer identifiers for the words. + * + * @param words Array of symbols + * @return the unique integer identifiers for the words + */ + int[] addTerminals(String[] words); + + /** + * Gets the unique integer identifiers for the words + * in the sentence. + * + * @param sentence Space-delimited string of symbols + * @return the unique integer identifiers for the words + * in the sentence + */ + int[] addTerminals(String sentence); + + /** + * Gets an integer identifier for the word. + *

+ * If the word is in the vocabulary, the integer returned + * will uniquely identify that word. + *

+ * If the word is not in the vocabulary, the integer returned + * by getUnknownWordID may be returned. + * + * Alternatively, implementations may, if they choose, add + * unknown words and assign them a symbol ID instead of + * returning getUnknownWordID. + * + * @see #getUnknownWordID + * @return the unique integer identifier for wordString, + * or the result of getUnknownWordID + * if wordString is not in the vocabulary + * @param wordString the word to retrieve the integer identifier + */ + int getID(String wordString); + + /** + * Gets the integer identifiers for all words in the provided + * sentence. + *

+ * The sentence will be split (on spaces) into words, then + * the integer identifier for each word will be retrieved + * using getID. + * + * @see #getID(String) + * @param sentence String of words, separated by spaces. + * @return Array of integer identifiers for each word in + * the sentence + */ + int[] getIDs(String sentence); + + /** + * Gets the String that corresponds to the specified integer + * identifier. + *

+ * If the identifier is in the symbol vocabulary, the String + * returned will correspond to that identifier. + * + * Otherwise, the String returned by getUnknownWord + * will be returned. + * + * @param wordID an integer identifier for a specific String + * @return the String that corresponds to the specified + * integer identifier, or the result of + * getUnknownWord if the identifier + * does not correspond to a word in the vocabulary + */ + String getTerminal(int wordID); + + /** + * Gets the String that corresponds to the specified integer + * identifier. + *

+ * This method can be called for terminals or nonterminals. + * + * @param tokenID Integer identifier + * @return the String that corresponds to the specified + * integer identifier + */ + String getWord(int tokenID); + + /** + * Gets the String that corresponds to the sequence of + * specified integer identifiers. + * + * @param ids Sequence of integer identifiers + * @return the String that corresponds to the sequence of + * specified integer identifiers + */ + String getWords(int[] ids); + + /** + * + * @param wordIDs an int[] of identifiers for a specific Strings + * @return the String that corresponds to the specified + * integer identifiers + */ + String getTerminals(int[] wordIDs); + + /** + * Gets a collection over all symbol identifiers for the + * vocabulary. + * + * @return a collection over all symbol identifiers for the + * vocabulary + */ + Collection getAllIDs(); + + /** + * Gets the list of all words represented by this vocabulary. + * + * @return the list of all words represented by this + * vocabulary + */ + Collection getWords(); + + /** + * Gets the number of unique words in the vocabulary. + * + * @return the number of unique words in the vocabulary. + */ + int size(); + + /** + * Gets the integer symbol representation of the unknown + * word. + * + * @return the integer symbol representation of the unknown + * word. + */ + int getUnknownWordID(); + + /** + * Gets the string representation of the unknown word. + * + * @return the string representation of the unknown word. + */ + String getUnknownWord(); + + /** + * Returns true if the symbol id represents a + * nonterminal, false otherwise. + * + * @param id int symbol id + * @return true if the symbol id represents a + * nonterminal, false otherwise. + */ + boolean isNonterminal(int id); + + /** + * Gets the lowest-valued allowable terminal symbol id in + * this table. + * + * @return the lowest-valued allowable terminal symbol id + * in this table. + */ + int getLowestID(); + + + /** + * Gets the highest-valued allowable terminal symbol id in + * this table. + *

+ * NOTE: This may or may not return the same value as + * size. + * + * @return the highest-valued allowable terminal symbol id + * in this table. + */ + int getHighestID(); + + /** + * @param id todo + * @return todo + */ + int getTargetNonterminalIndex(int id);//first convert id to its String mapping, then call the function below + + /** + * @param word todo + * @return todo + */ + int getTargetNonterminalIndex(String word); + + /** + * @param wordIDs todo + * @param ntIndexIncrements todo + * @return todo + */ + String getWords(int[] wordIDs, boolean ntIndexIncrements); + +} \ No newline at end of file diff --git a/src/joshua/corpus/TerminalIterator.java b/src/main/java/org/apache/joshua/corpus/TerminalIterator.java similarity index 93% rename from src/joshua/corpus/TerminalIterator.java rename to src/main/java/org/apache/joshua/corpus/TerminalIterator.java index 29544fb5..e82b4cc5 100644 --- a/src/joshua/corpus/TerminalIterator.java +++ b/src/main/java/org/apache/joshua/corpus/TerminalIterator.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; +package org.apache.joshua.corpus; import java.util.Iterator; import java.util.NoSuchElementException; @@ -39,8 +39,7 @@ public class TerminalIterator implements Iterator { /** * Constructs an iterator for the terminals in the given list of words. * - * @param vocab - * @param words + * @param words array of words */ public TerminalIterator(int[] words) { this.words = words; @@ -75,7 +74,7 @@ public Integer next() { /** * Unsupported operation, guaranteed to throw an UnsupportedOperationException. * - * @throws UnsupportedOperationException + * @throws UnsupportedOperationException operation not supported yet! */ public void remove() { throw new UnsupportedOperationException(); diff --git a/src/joshua/corpus/Vocabulary.java b/src/main/java/org/apache/joshua/corpus/Vocabulary.java similarity index 84% rename from src/joshua/corpus/Vocabulary.java rename to src/main/java/org/apache/joshua/corpus/Vocabulary.java index d79170d6..2bcc447b 100644 --- a/src/joshua/corpus/Vocabulary.java +++ b/src/main/java/org/apache/joshua/corpus/Vocabulary.java @@ -16,25 +16,30 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus; +package org.apache.joshua.corpus; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; +import java.io.Externalizable; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.ObjectInput; +import java.io.ObjectOutput; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.locks.StampedLock; -import joshua.decoder.Decoder; -import joshua.decoder.ff.lm.NGramLanguageModel; -import joshua.util.FormatUtils; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.ff.lm.NGramLanguageModel; +import org.apache.joshua.util.FormatUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Static singular vocabulary class. 
@@ -43,8 +48,9 @@ * @author Juri Ganitkevitch */ -public class Vocabulary { +public class Vocabulary implements Externalizable { + private static final Logger LOG = LoggerFactory.getLogger(Vocabulary.class); private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>(); private static List<String> idToString; @@ -80,15 +86,15 @@ public static boolean registerLanguageModel(NGramLanguageModel lm) { * Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to * reading the file. * - * @param file_name + * @param vocab_file path to a vocabulary file * @return Returns true if vocabulary was read without mismatches or collisions. - * @throws IOException + * @throws IOException if the file cannot be found or read properly */ public static boolean read(final File vocab_file) throws IOException { DataInputStream vocab_stream = new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file))); int size = vocab_stream.readInt(); - Decoder.LOG(1, String.format("Read %d entries from the vocabulary", size)); + LOG.info("Read {} entries from the vocabulary", size); clear(); for (int i = 0; i < size; i++) { int id = vocab_stream.readInt(); @@ -109,7 +115,7 @@ public static void write(String file_name) throws IOException { DataOutputStream vocab_stream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file))); vocab_stream.writeInt(idToString.size() - 1); - Decoder.LOG(1, String.format("Writing vocabulary: %d tokens", idToString.size() - 1)); + LOG.info("Writing vocabulary: {} tokens", idToString.size() - 1); for (int i = 1; i < idToString.size(); i++) { vocab_stream.writeInt(i); vocab_stream.writeUTF(idToString.get(i)); @@ -125,9 +131,12 @@ public static void write(String file_name) throws IOException { * Get the id of the token if it already exists, new id is created otherwise. * * TODO: currently locks for every call. Separate constant (frozen) ids from - * changing (e.g. OOV) ids. Constant ids could be immutable -> no locking. + * changing (e.g. OOV) ids. Constant ids could be immutable -> no locking. * Alternatively: could we use ConcurrentHashMap to not have to lock if * actually contains it and only lock for modifications? + * + * @param token a token to obtain an id for + * @return the token id */ public static int id(String token) { // First attempt an optimistic read @@ -185,7 +194,7 @@ public static boolean hasId(int id) { public static int[] addAll(String sentence) { return addAll(sentence.split("\\s+")); } - + public static int[] addAll(String[] tokens) { int[] ids = new int[tokens.length]; for (int i = 0; i < tokens.length; i++) @@ -230,8 +239,8 @@ public static String getUnknownWord() { /** * Returns true if the Vocabulary ID represents a nonterminal.
* - * @param id - * @return + * @param id vocabularly ID to check + * @return true if the Vocabulary ID represents a nonterminal */ public static boolean nt(int id) { return (id < 0); @@ -275,4 +284,26 @@ public static void unregisterLanguageModels() { LMs.clear(); } + @Override + public void writeExternal(ObjectOutput out) throws IOException { + // TODO Auto-generated method stub + + } + + @Override + public void readExternal(ObjectInput in) + throws IOException, ClassNotFoundException { + // TODO Auto-generated method stub + + } + + @Override + public boolean equals(Object o) { + if(getClass() == o.getClass()) { + return true; + } else { + return false; + } + } + } diff --git a/src/joshua/corpus/syntax/ArraySyntaxTree.java b/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java similarity index 98% rename from src/joshua/corpus/syntax/ArraySyntaxTree.java rename to src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java index d2a457a2..f3742798 100644 --- a/src/joshua/corpus/syntax/ArraySyntaxTree.java +++ b/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.corpus.syntax; +package org.apache.joshua.corpus.syntax; import java.io.Externalizable; import java.io.IOException; @@ -30,8 +30,8 @@ import java.util.Set; import java.util.Stack; -import joshua.corpus.Vocabulary; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.util.io.LineReader; public class ArraySyntaxTree implements SyntaxTree, Externalizable { @@ -280,17 +280,14 @@ public Collection getCcgLabels(int from, int to) { // TODO: bothersome no-backwards-arrays method. } } - return labels; } - @Override public int[] getTerminals() { return getTerminals(0, terminals.size()); } - @Override public int[] getTerminals(int from, int to) { int[] span = new int[to - from]; @@ -299,40 +296,32 @@ public int[] getTerminals(int from, int to) { return span; } - public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { // TODO Auto-generated method stub - } - public void writeExternal(ObjectOutput out) throws IOException { // TODO Auto-generated method stub - } - /** * Reads Penn Treebank format file + * @param file_name the string path of the Penn Treebank file + * @throws IOException if the file does not exist */ public void readExternalText(String file_name) throws IOException { LineReader reader = new LineReader(file_name); - initialize(); - for (String line : reader) { if (line.trim().equals("")) continue; appendFromPennFormat(line); } } - public void writeExternalText(String file_name) throws IOException { // TODO Auto-generated method stub - } - @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git a/src/joshua/corpus/syntax/SyntaxTree.java b/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java similarity index 96% rename from src/joshua/corpus/syntax/SyntaxTree.java rename to src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java index bd318980..6bb4c0b0 100644 --- a/src/joshua/corpus/syntax/SyntaxTree.java +++ b/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.corpus.syntax; +package org.apache.joshua.corpus.syntax; import java.util.Collection; diff --git a/src/joshua/decoder/ArgsParser.java b/src/main/java/org/apache/joshua/decoder/ArgsParser.java similarity index 78% rename from src/joshua/decoder/ArgsParser.java rename to src/main/java/org/apache/joshua/decoder/ArgsParser.java index 731bca12..2d98473f 100644 --- a/src/joshua/decoder/ArgsParser.java +++ b/src/main/java/org/apache/joshua/decoder/ArgsParser.java @@ -16,14 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; -import joshua.util.io.LineReader; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @author orluke @@ -31,14 +33,17 @@ */ public class ArgsParser { + private static final Logger LOG = LoggerFactory.getLogger(ArgsParser.class); + private String configFile = null; /** * Parse the arguments passed from the command line when the JoshuaDecoder application was * executed from the command line. * - * @param args - * @throws IOException + * @param args string array of input arguments + * @param joshuaConfiguration the {@link org.apache.joshua.decoder.JoshuaConfiguration} + * @throws IOException if there is an error wit the input arguments */ public ArgsParser(String[] args, JoshuaConfiguration joshuaConfiguration) throws IOException { @@ -59,8 +64,8 @@ public ArgsParser(String[] args, JoshuaConfiguration joshuaConfiguration) throws LineReader reader = new LineReader(String.format("%s/VERSION", System.getenv("JOSHUA"))); reader.readLine(); String version = reader.readLine().split("\\s+")[2]; - System.out.println(String.format("The Joshua machine translator, version %s", version)); - System.out.println("joshua-decoder.org"); + System.out.println(String.format("The Apache Joshua machine translator, version %s", version)); + System.out.println("joshua.incubator.apache.org"); System.exit(0); } else if (args[i].equals("-license")) { @@ -71,7 +76,7 @@ public ArgsParser(String[] args, JoshuaConfiguration joshuaConfiguration) throws System.out.println(line); } } catch (IOException e) { - System.err.println("FATAL: missing license file!"); + throw new RuntimeException("FATAL: missing license file!", e); } System.exit(0); } @@ -83,19 +88,17 @@ public ArgsParser(String[] args, JoshuaConfiguration joshuaConfiguration) throws setConfigFile(args[i + 1].trim()); try { - Decoder.LOG(1, "Parameters read from configuration file:"); + LOG.info("Parameters read from configuration file: {}", getConfigFile()); joshuaConfiguration.readConfigFile(getConfigFile()); } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + throw new RuntimeException(e); } - break; } } // Now process all the command-line args - Decoder.LOG(1, "Parameters overridden from the command line:"); + LOG.info("Parameters overridden from the command line:"); joshuaConfiguration.processCommandLineOptions(args); } } diff --git a/src/joshua/decoder/BLEU.java b/src/main/java/org/apache/joshua/decoder/BLEU.java similarity index 95% rename from src/joshua/decoder/BLEU.java rename to src/main/java/org/apache/joshua/decoder/BLEU.java index 1b3e3f81..8b514032 100644 --- a/src/joshua/decoder/BLEU.java +++ b/src/main/java/org/apache/joshua/decoder/BLEU.java @@ -16,26 +16,25 @@ * specific language governing permissions and 
limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.util.ArrayList; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.state_maintenance.NgramDPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HyperEdge; -import joshua.util.Ngram; -import joshua.util.Regex; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.util.Ngram; +import org.apache.joshua.util.Regex; /** * this class implements: (1) sentence-level bleu, with smoothing * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class BLEU { // do_ngram_clip: consider global n-gram clip @@ -47,11 +46,12 @@ public static float computeSentenceBleu(String[] refSents, String hypSent) { // ====================multiple references /** * - * @param refSents - * @param hypSent + * @param refSents todo + * @param hypSent todo * @param doNgramClip Should usually be true * @param bleuOrder Should usually be 4 * @param useShortestRef Probably use false + * @return todo */ public static float computeSentenceBleu(String[] refSents, String hypSent, boolean doNgramClip, int bleuOrder, boolean useShortestRef) { @@ -92,6 +92,9 @@ public static float computeEffectiveLen(int[] refLens, boolean useShortestRef) { /** * words in the ngrams are using integer symbol ID + * @param refSents todo + * @param bleuOrder todo + * @return todo * */ public static HashMap constructMaxRefCountTable(String[] refSents, int bleuOrder) { @@ -111,6 +114,8 @@ public static HashMap constructMaxRefCountTable(String[] refSen /** * compute max_ref_count for each ngram in the reference sentences + * @param listRefNgramTbl todo + * @return todo * */ public static HashMap computeMaxRefCountTbl( List> listRefNgramTbl) { @@ -195,10 +200,7 @@ public static float computeSentenceBleu(int refLen, HashMap ref numNgramMatch[Regex.spaces.split(ngram).length - 1] += Support.findMin( refNgramTbl.get(ngram), entry.getValue()); // ngram clip } else { - numNgramMatch[Regex.spaces.split(ngram).length - 1] += entry.getValue();// without - // ngram - // count - // clipping + numNgramMatch[Regex.spaces.split(ngram).length - 1] += entry.getValue();// without ngram count clipping } } } @@ -213,8 +215,7 @@ public static float computeSentenceBleu(int refLen, HashMap ref // sentence-bleu: BLEU= bp * prec; where prec = exp (sum 1/4 * log(prec[order])) public static float computeBleu(int hypLen, float refLen, int[] numNgramMatch, int bleuOrder) { if (hypLen <= 0 || refLen <= 0) { - System.out.println("error: ref or hyp is zero len"); - System.exit(1); + throw new RuntimeException("error: ref or hyp is zero len"); } float res = 0; float wt = 1.0f / bleuOrder; @@ -257,6 +258,11 @@ public static float computeLinearCorpusGain(float[] linearCorpusGainThetas, Stri /** * speed consideration: assume hypNgramTable has a smaller size than referenceNgramTable does + * @param linearCorpusGainThetas todo + * @param hypLength todo + * @param hypNgramTable todo + * @param referenceNgramTable todo + * @return todo */ public static float computeLinearCorpusGain(float[] linearCorpusGainThetas, int hypLength, Map hypNgramTable, Map referenceNgramTable) { @@ -332,8 +338,10 @@ static public float[] 
computeLinearCorpusThetas(int numUnigramTokens, float unig return res; } + public static final int maxOrder = 4; + /** - * Computes BLEU statistics incurred by a rule. This is (a) all ngram (n <= 4) for terminal rules + * Computes BLEU statistics incurred by a rule. This is (a) all ngram (n <= 4) for terminal rules * and (b) all ngrams overlying boundary points between terminals in the rule and ngram state from * tail nodes. * @@ -347,13 +355,11 @@ static public float[] computeLinearCorpusThetas(int numUnigramTokens, float unig * * Of these, all but the first have a boundary point to consider. * - * @param rule the rule being applied - * @param spanWidth the width of the span in the input sentence + * @param edge todo + * @param spanPct todo * @param references the reference to compute statistics against - * @return + * @return todo */ - public static final int maxOrder = 4; - public static Stats compute(HyperEdge edge, float spanPct, References references) { Stats stats = new Stats(); // TODO: this should not be the span width, but the real ref scaled to the span percentage @@ -376,10 +382,9 @@ public static Stats compute(HyperEdge edge, float spanPct, References references try { ngramState = (NgramDPState) edge.getTailNodes().get(tailIndex).getDPState(0); } catch (ClassCastException e) { - System.err.println(String.format( + throw new RuntimeException(String.format( "* FATAL: first state needs to be NgramDPState (found %s)", edge.getTailNodes() .get(tailIndex).getDPState(0).getClass())); - System.exit(1); } // Compute ngrams overlapping with left context of tail node diff --git a/src/joshua/decoder/Decoder.java b/src/main/java/org/apache/joshua/decoder/Decoder.java similarity index 85% rename from src/joshua/decoder/Decoder.java rename to src/main/java/org/apache/joshua/decoder/Decoder.java index 0057f87f..d13bf423 100644 --- a/src/joshua/decoder/Decoder.java +++ b/src/main/java/org/apache/joshua/decoder/Decoder.java @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. 
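The comment in the BLEU hunk above pins down the combination computeBleu implements: BLEU = BP * prec, where prec = exp(sum 1/order * log(prec[n])). A self-contained sketch of that arithmetic follows; the candidate-count denominator and the log(0) floor are simplifying assumptions for illustration, since the real method takes its match counts from the n-gram tables built earlier in the class.

```java
// Sentence-level BLEU = BP * exp(sum_n (1/order) * log prec_n).
static float sentenceBleu(int hypLen, float refLen, int[] numNgramMatch, int bleuOrder) {
  if (hypLen <= 0 || refLen <= 0)
    throw new IllegalArgumentException("ref or hyp is zero len");
  double logPrecSum = 0.0;
  for (int n = 1; n <= bleuOrder; n++) {
    double matched = Math.max(numNgramMatch[n - 1], 1e-9);  // floor avoids log(0)
    double possible = Math.max(hypLen - n + 1, 1);          // candidate n-grams
    logPrecSum += Math.log(matched / possible) / bleuOrder;
  }
  // brevity penalty: 1 if the hypothesis is at least reference length
  double bp = (hypLen >= refLen) ? 1.0 : Math.exp(1.0 - refLen / hypLen);
  return (float) (bp * Math.exp(logPrecSum));
}
```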
*/ -package joshua.decoder; +package org.apache.joshua.decoder; -import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; -import java.io.BufferedWriter; +import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.io.OutputStream; @@ -28,7 +28,6 @@ import java.io.FileWriter; import java.lang.reflect.Constructor; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -37,32 +36,34 @@ import com.google.common.base.Strings; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.JoshuaConfiguration.INPUT_TYPE; -import joshua.decoder.JoshuaConfiguration.SERVER_TYPE; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.PhraseModel; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.lm.LanguageModelFF; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; -import joshua.decoder.ff.tm.packed.PackedGrammar; -import joshua.decoder.io.JSONMessage; -import joshua.decoder.io.TranslationRequestStream; -import joshua.decoder.phrase.PhraseTable; -import joshua.decoder.segment_file.Sentence; -import joshua.util.FileUtility; -import joshua.util.FormatUtils; -import joshua.util.Regex; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.JoshuaConfiguration.INPUT_TYPE; +import org.apache.joshua.decoder.JoshuaConfiguration.SERVER_TYPE; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.PhraseModel; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.lm.LanguageModelFF; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar; +import org.apache.joshua.decoder.io.JSONMessage; +import org.apache.joshua.decoder.io.TranslationRequestStream; +import org.apache.joshua.decoder.phrase.PhraseTable; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.util.FileUtility; +import org.apache.joshua.util.FormatUtils; +import org.apache.joshua.util.Regex; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class handles decoder initialization and the complication introduced by multithreading. - * + * * After initialization, the main entry point to the Decoder object is * decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable * Translations object. It is important that we support multithreading both (a) across the sentences @@ -74,19 +75,21 @@ * parallelization by separating out reading the input stream from processing the translated sentences, * but also ensures that round-robin parallelization occurs, since RequestParallelizer uses the * thread pool before translating each request. - * + * * A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. 
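The fair, blocking handoff this class comment describes is easiest to picture as a BlockingQueue used as the thread pool. The sketch below is a schematic reconstruction under that reading, not the class's actual code; the no-argument DecoderThread constructor is hypothetical (the real one takes grammars, weights, and feature functions).

```java
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// Schematic of the pool handoff; the fairness flag gives the FIFO
// ordering across requests that getThread() promises.
class ThreadPoolSketch {
  private final BlockingQueue<DecoderThread> threadPool;

  ThreadPoolSketch(int numThreads) throws InterruptedException {
    threadPool = new ArrayBlockingQueue<>(numThreads, /* fair= */ true);
    for (int i = 0; i < numThreads; i++)
      threadPool.put(new DecoderThread());    // hypothetical no-arg constructor
  }

  void translateOne() throws InterruptedException {
    DecoderThread worker = threadPool.take(); // blocks until a thread is free
    try {
      // worker.translate(sentence) would run here
    } finally {
      threadPool.put(worker);                 // always return the thread
    }
  }
}
```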
The purpose * of the runner is to record where to place the translated sentence when it is done (i.e., which * Translations object). Translations itself is an iterator whose next() call blocks until the next * translation is available. - * - * @author Matt Post - * @author Zhifei Li, - * @author wren ng thornton - * @author Lane Schwartz + * + * @author Matt Post post@cs.jhu.edu + * @author Zhifei Li, zhifei.work@gmail.com + * @author wren ng thornton wren@users.sourceforge.net + * @author Lane Schwartz dowobeha@users.sourceforge.net */ public class Decoder { + private static final Logger LOG = LoggerFactory.getLogger(Decoder.class); + private final JoshuaConfiguration joshuaConfiguration; public JoshuaConfiguration getJoshuaConfiguration() { @@ -116,8 +119,9 @@ public JoshuaConfiguration getJoshuaConfiguration() { /** * Constructor method that creates a new decoder using the specified configuration file. - * - * @param configFile Name of configuration file. + * + * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} + * @param configFile name of configuration file. */ public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) { this(joshuaConfiguration); @@ -126,8 +130,9 @@ public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) { /** * Factory method that creates a new decoder using the specified configuration file. - * + * * @param configFile Name of configuration file. + * @return a configured {@link org.apache.joshua.decoder.Decoder} */ public static Decoder createDecoder(String configFile) { JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration(); @@ -154,6 +159,8 @@ private Decoder(JoshuaConfiguration joshuaConfiguration) { *

* This method is called by unit tests or any outside packages (e.g., MERT) relying on the * decoder. + * @param joshuaConfiguration a {@link org.apache.joshua.decoder.JoshuaConfiguration} object + * @return an uninitialized decoder for use in testing */ static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) { return new Decoder(joshuaConfiguration); @@ -168,9 +175,9 @@ static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfigura * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a * thread that will read the request's sentences, obtain a DecoderThread to translate them, and * then place the Translation in the appropriate place. - * + * * @author Matt Post - * + * */ private class RequestParallelizer extends Thread { /* Source of sentences to translate. */ @@ -178,12 +185,12 @@ private class RequestParallelizer extends Thread { /* Where to put translated sentences. */ private final Translations response; - + /* Sometimes we need to communicate with the client even when we didn't get a new sentence * (e.g., metadata) */ private OutputStream out; - + RequestParallelizer(TranslationRequestStream request, Translations response, OutputStream out) { this.request = request; this.response = response; @@ -202,7 +209,7 @@ public void run() { Sentence sentence = null; try { sentence = request.next(); - + } catch (MetaDataException meta) { try { handleMetadata(meta); @@ -212,7 +219,7 @@ public void run() { continue; } - + if (sentence == null) { response.finish(); break; @@ -227,7 +234,7 @@ public void run() { /** * When metadata is found on the input, it needs to be processed. That is done here. Sometimes * this involves returning data to the client. - * + * * @param meta * @throws IOException */ @@ -236,32 +243,31 @@ private void handleMetadata(MetaDataException meta) throws IOException { // Change a decoder weight String[] tokens = meta.tokens(); if (tokens.length != 3) { - System.err.println("* Error: weight change requires three tokens"); + LOG.error("weight change requires three tokens"); } else { float old_weight = Decoder.weights.getWeight(tokens[1]); Decoder.weights.set(tokens[1], Float.parseFloat(tokens[2])); - System.err.println(String.format("@set_weight: %s %.3f -> %.3f", - tokens[1], old_weight, - Decoder.weights.getWeight(tokens[1]))); + LOG.error("@set_weight: {} {} -> {}", tokens[1], old_weight, + Decoder.weights.getWeight(tokens[1])); } - + // TODO: return a JSON object with this weight or all weights out.write("".getBytes()); } else if (meta.type().equals("get_weight")) { // TODO: add to JSON object, send back - + String[] tokens = meta.tokens(); - - System.err.println(String.format("%s = %f", tokens[1], Decoder.weights.getWeight(tokens[1]))); + + LOG.error("{} = {}", tokens[1], Decoder.weights.getWeight(tokens[1])); out.write("".getBytes()); - + } else if (meta.type().equals("add_rule")) { String tokens[] = meta.tokens(" \\|\\|\\| "); if (tokens.length != 2) { - System.err.println("* INVALID RULE '" + meta.tokenString() + "'");; + LOG.error("* INVALID RULE '{}'", meta); out.write("bad rule".getBytes()); return; } @@ -270,22 +276,22 @@ private void handleMetadata(MetaDataException meta) throws IOException { String.format("[X] ||| [X,1] %s ||| [X,1] %s ||| custom=1", tokens[0], tokens[1])); Decoder.this.customPhraseTable.addRule(rule); rule.estimateRuleCost(featureFunctions); - Decoder.LOG(1, String.format("Added custom rule %s", formatRule(rule))); - + LOG.info("Added custom rule {}", 
formatRule(rule)); + String response = String.format("Added rule %s", formatRule(rule)); out.write(response.getBytes()); } else if (meta.type().equals("list_rules")) { - + JSONMessage message = new JSONMessage(); - + // Walk the the grammar trie ArrayList nodes = new ArrayList(); nodes.add(customPhraseTable.getTrieRoot()); - + while (nodes.size() > 0) { Trie trie = nodes.remove(0); - + if (trie == null) continue; @@ -298,9 +304,9 @@ private void handleMetadata(MetaDataException meta) throws IOException { if (trie.getExtensions() != null) nodes.addAll(trie.getExtensions()); } - + out.write(message.toString().getBytes()); - + } else if (meta.type().equals("remove_rule")) { // Remove a rule from a custom grammar, if present String[] tokens = meta.tokenString().split(" \\|\\|\\| "); @@ -325,7 +331,7 @@ private void handleMetadata(MetaDataException meta) throws IOException { for (Rule rule: trie.getRuleCollection().getRules()) { String target = rule.getEnglishWords(); target = target.substring(target.indexOf(' ') + 1); - + if (tokens[1].equals(target)) { matched = rule; break; @@ -335,14 +341,14 @@ private void handleMetadata(MetaDataException meta) throws IOException { out.write(String.format("Removed rule %s", formatRule(matched)).getBytes()); return; } - + out.write(String.format("No such rule %s", meta.tokenString()).getBytes()); } } /** * Strips the nonterminals from the lefthand side of the rule. - * + * * @param rule * @return */ @@ -354,7 +360,7 @@ private String formatRule(Rule rule) { ruleString += " " + Vocabulary.word(word); first = false; } - + ruleString += " |||"; // space will get added with first English word first = true; for (int word: rule.getEnglish()) { @@ -371,7 +377,7 @@ private String formatRule(Rule rule) { /** * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in * a fair fashion (i.e,. FIFO across requests). - * + * * @return a thread that can be used for decoding. */ public DecoderThread getThread() { @@ -389,11 +395,11 @@ public DecoderThread getThread() { * input Sentence, returning a Translation object when its done). This is done in a thread so as * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in * the TranslationRequest, in turn permitting parallelization across the sentences of a request. - * + * * When the decoder thread is finshed, the Translation object is placed in the correct place in * the corresponding Translations object that was returned to the caller of * Decoder.decodeAll(TranslationRequest). - * + * * @author Matt Post */ private class DecoderThreadRunner extends Thread { @@ -424,11 +430,9 @@ public void run() { */ threadPool.put(decoderThread); } catch (Exception e) { - System.err.println(String.format( - "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage())); - e.printStackTrace(); - System.exit(1);; -// translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration)); + throw new RuntimeException(String.format( + "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage()), e); + // translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration)); } } } @@ -437,22 +441,22 @@ public void run() { * This function is the main entry point into the decoder. It translates all the sentences in a * (possibly boundless) set of input sentences. Each request launches its own thread to read the * sentences of the request. 
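One observation on the metadata handlers above: the @set_weight confirmation and the get_weight readout now go through LOG.error even though they report normal operation, which looks like mechanical fallout of the System.err conversion rather than a deliberate choice. If so, the informational level would match the old verbosity-1 behavior; a suggestion, not part of the patch:

```java
// Suggested levels for the two informational messages (sketch only):
LOG.info("@set_weight: {} {} -> {}", tokens[1], old_weight,
    Decoder.weights.getWeight(tokens[1]));
LOG.info("{} = {}", tokens[1], Decoder.weights.getWeight(tokens[1]));
```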
- * - * @param request - * @return an iterable set of Translation objects - * @throws IOException + * + * @param request the populated {@link org.apache.joshua.decoder.io.TranslationRequestStream} + * @param out an appropriate {@link java.io.OutputStream} to write results to + * @throws IOException if there is an error with the input stream or writing the output */ public void decodeAll(TranslationRequestStream request, OutputStream out) throws IOException { Translations translations = new Translations(request); /* Start a thread to handle requests on the input stream */ new RequestParallelizer(request, translations, out).start(); - + // Create the n-best output stream FileWriter nbest_out = null; if (joshuaConfiguration.n_best_file != null) nbest_out = new FileWriter(joshuaConfiguration.n_best_file); - + for (;;) { Translation translation = translations.next(); if (translation == null) @@ -461,7 +465,7 @@ public void decodeAll(TranslationRequestStream request, OutputStream out) throws if (joshuaConfiguration.input_type == INPUT_TYPE.json || joshuaConfiguration.server_type == SERVER_TYPE.HTTP) { JSONMessage message = JSONMessage.buildMessage(translation); out.write(message.toString().getBytes()); - + } else { /** * We need to munge the feature value outputs in order to be compatible with Moses tuners. @@ -475,12 +479,12 @@ public void decodeAll(TranslationRequestStream request, OutputStream out) throws // Write the complete formatted string to STDOUT if (joshuaConfiguration.n_best_file != null) nbest_out.write(text); - + // Extract just the translation and output that to STDOUT text = text.substring(0, text.indexOf('\n')); String[] fields = text.split(" \\|\\|\\| "); text = fields[1] + "\n"; - + } else { text = translation.toString(); } @@ -489,7 +493,7 @@ public void decodeAll(TranslationRequestStream request, OutputStream out) throws } out.flush(); } - + if (joshuaConfiguration.n_best_file != null) nbest_out.close(); } @@ -497,9 +501,9 @@ public void decodeAll(TranslationRequestStream request, OutputStream out) throws /** * We can also just decode a single sentence. - * - * @param sentence - * @return The translated sentence + * + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @return the sentence {@link org.apache.joshua.decoder.Translation} */ public Translation decode(Sentence sentence) { // Get a thread. @@ -534,7 +538,7 @@ public void cleanUp() { } resetGlobalState(); } - + public static void resetGlobalState() { // clear/reset static variables DENSE_FEATURE_NAMES.clear(); @@ -570,7 +574,7 @@ public static void writeConfigFile(double[] newWeights, String template, String if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) { newSent.append(fds[0]).append(' '); newSent.append(newDiscriminativeModel).append(' ');// change the - // file name + // file name for (int i = 2; i < fds.length - 1; i++) { newSent.append(fds[i]).append(' '); } @@ -610,7 +614,7 @@ public static void writeConfigFile(double[] newWeights, String template, String * Moses requires the pattern .*_.* for sparse features, and prohibits underscores in dense features. * This conforms to that pattern. We assume non-conforming dense features start with tm_ or lm_, * and the only sparse feature that needs converting is OOVPenalty. 
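Concretely, the Moses rewrite described here is a prefix-conditional underscore-to-hyphen swap, applied only when the moses compatibility flag is set. Illustrative inputs and outputs for the mosesize method defined just below (the OOVPenalty case the comment mentions is handled outside this hunk):

```java
// Assuming joshuaConfiguration.moses == true:
mosesize("tm_pt_0");    // -> "tm-pt-0"    (dense, tm_ prefix rewritten)
mosesize("lm_0");       // -> "lm-0"       (dense, lm_ prefix rewritten)
mosesize("Distortion"); // -> "Distortion" (no tm_/lm_ prefix: unchanged)
```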
- * + * * @param feature * @return the feature in Moses format */ @@ -619,13 +623,13 @@ private String mosesize(String feature) { if (feature.startsWith("tm_") || feature.startsWith("lm_")) return feature.replace("_", "-"); } - + return feature; } - + /** * Initialize all parts of the JoshuaDecoder. - * + * * @param configFile File containing configuration options * @return An initialized decoder */ @@ -646,12 +650,12 @@ public Decoder initialize(String configFile) { for (int i = 0; i < tokens.length; i += 2) { String feature = tokens[i]; float value = Float.parseFloat(tokens[i+1]); - + if (joshuaConfiguration.moses) feature = demoses(feature); - + joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1])); - Decoder.LOG(1, String.format("COMMAND LINE WEIGHT: %s -> %.3f", feature, value)); + LOG.info("COMMAND LINE WEIGHT: {} -> {}", feature, value); } } @@ -661,21 +665,19 @@ public Decoder initialize(String configFile) { /* Sanity check for old-style unsupported feature invocations. */ if (pair.length != 2) { - System.err.println("FATAL: Invalid feature weight line found in config file."); - System.err - .println(String.format("The line was '%s'", pairStr)); - System.err - .println("You might be using an old version of the config file that is no longer supported"); - System.err - .println("Check joshua-decoder.org or email joshua_support@googlegroups.com for help"); - System.exit(17); + StringBuilder errMsg = new StringBuilder(); + errMsg.append("FATAL: Invalid feature weight line found in config file.\n"); + errMsg.append(String.format("The line was '%s'\n", pairStr)); + errMsg.append("You might be using an old version of the config file that is no longer supported\n"); + errMsg.append("Check joshua-decoder.org or email joshua_support@googlegroups.com for help\n"); + errMsg.append("Code = " + 17); + throw new RuntimeException(errMsg.toString()); } weights.set(pair[0], Float.parseFloat(pair[1])); } - Decoder.LOG(1, String.format("Read %d weights (%d of them dense)", weights.size(), - DENSE_FEATURE_NAMES.size())); + LOG.info("Read {} weights ({} of them dense)", weights.size(), DENSE_FEATURE_NAMES.size()); // Do this before loading the grammars and the LM. this.featureFunctions = new ArrayList(); @@ -683,9 +685,8 @@ public Decoder initialize(String configFile) { // Initialize and load grammars. This must happen first, since the vocab gets defined by // the packed grammar (if any) this.initializeTranslationGrammars(); - - Decoder.LOG(1, String.format("Grammar loading took: %d seconds.", - (System.currentTimeMillis() - pre_load_time) / 1000)); + LOG.info("Grammar loading took: {} seconds.", + (System.currentTimeMillis() - pre_load_time) / 1000); // Initialize the features: requires that LM model has been initialized. 
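For reference, the weight-overwrite loop near the top of initialize() above consumes a flat string of alternating names and values. A minimal sketch with a hypothetical override string; the real loop additionally applies demoses() in Moses mode and appends each pair to the configuration's weight list:

```java
String weightOverwrite = "lm_0 0.5 tm_pt_0 -0.2";  // hypothetical example
String[] tokens = weightOverwrite.trim().split("\\s+");
for (int i = 0; i < tokens.length; i += 2) {
  String feature = tokens[i];
  float value = Float.parseFloat(tokens[i + 1]);
  System.out.printf("%s -> %.3f%n", feature, value); // ("lm_0", 0.500) ...
}
```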
this.initializeFeatureFunctions(); @@ -694,24 +695,24 @@ public Decoder initialize(String configFile) { if (joshuaConfiguration.show_weights_and_quit) { for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { String name = DENSE_FEATURE_NAMES.get(i); - if (joshuaConfiguration.moses) + if (joshuaConfiguration.moses) System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i))); else System.out.println(String.format("%s %.5f", name, weights.getDense(i))); } System.exit(0); } - + // Sort the TM grammars (needed to do cube pruning) if (joshuaConfiguration.amortized_sorting) { - Decoder.LOG(1, "Grammar sorting happening lazily on-demand."); + LOG.info("Grammar sorting happening lazily on-demand."); } else { long pre_sort_time = System.currentTimeMillis(); for (Grammar grammar : this.grammars) { grammar.sortGrammar(this.featureFunctions); } - Decoder.LOG(1, String.format("Grammar sorting took %d seconds.", - (System.currentTimeMillis() - pre_sort_time) / 1000)); + LOG.info("Grammar sorting took {} seconds.", + (System.currentTimeMillis() - pre_sort_time) / 1000); } // Create the threads @@ -719,12 +720,8 @@ public Decoder initialize(String configFile) { this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights, this.featureFunctions, joshuaConfiguration)); } - - } catch (IOException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + } catch (IOException | InterruptedException e) { + LOG.warn(e.getMessage(), e); } return this; @@ -732,7 +729,7 @@ public Decoder initialize(String configFile) { /** * Initializes translation grammars Retained for backward compatibility - * + * * @param ownersSeen Records which PhraseModelFF's have been instantiated (one is needed for each * owner) * @throws IOException @@ -763,24 +760,24 @@ private void initializeTranslationGrammars() throws IOException { packed_grammars.add(packed_grammar); grammar = packed_grammar; } catch (FileNotFoundException e) { - System.err.println(String.format("Couldn't load packed grammar from '%s'", path)); - System.err.println("Perhaps it doesn't exist, or it may be an old packed file format."); - System.exit(2); + String msg = String.format("Couldn't load packed grammar from '%s'", path) + + "Perhaps it doesn't exist, or it may be an old packed file format."; + throw new RuntimeException(e); } } else { // thrax, hiero, samt grammar = new MemoryBasedBatchGrammar(type, path, owner, joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration); } - + } else { - int maxSourceLen = parsedArgs.containsKey("max-source-len") + int maxSourceLen = parsedArgs.containsKey("max-source-len") ? Integer.parseInt(parsedArgs.get("max-source-len")) : -1; joshuaConfiguration.search_algorithm = "stack"; - grammar = new PhraseTable(path, owner, type, joshuaConfiguration, maxSourceLen); + grammar = new PhraseTable(path, owner, type, joshuaConfiguration); } this.grammars.add(grammar); @@ -789,7 +786,7 @@ private void initializeTranslationGrammars() throws IOException { checkSharedVocabularyChecksumsForPackedGrammars(packed_grammars); } else { - Decoder.LOG(1, "* WARNING: no grammars supplied! Supplying dummy glue grammar."); + LOG.warn("no grammars supplied! 
Supplying dummy glue grammar."); MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration); glueGrammar.setSpanLimit(-1); glueGrammar.addGlueRules(featureFunctions); @@ -797,12 +794,12 @@ private void initializeTranslationGrammars() throws IOException { } /* Add the grammar for custom entries */ - this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration, 0); + this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration); this.grammars.add(this.customPhraseTable); /* Create an epsilon-deleting grammar */ if (joshuaConfiguration.lattice_decoding) { - Decoder.LOG(1, "Creating an epsilon-deleting grammar"); + LOG.info("Creating an epsilon-deleting grammar"); MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration); latticeGrammar.setSpanLimit(-1); HieroFormatReader reader = new HieroFormatReader(); @@ -810,6 +807,7 @@ private void initializeTranslationGrammars() throws IOException { String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol); String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal); + //FIXME: too many arguments String ruleString = String.format("[%s] ||| [%s,1] ||| [%s,1] ||| ", goalNT, goalNT, defaultNT, goalNT, defaultNT); @@ -831,11 +829,11 @@ private void initializeTranslationGrammars() throws IOException { ownersSeen.add(owner); } } - - Decoder.LOG(1, String.format("Memory used %.1f MB", - ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0))); + + LOG.info("Memory used {} MB", + ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0)); } - + /** * Checks if multiple packedGrammars have the same vocabulary by comparing their vocabulary file checksums. */ @@ -849,7 +847,7 @@ private static void checkSharedVocabularyChecksumsForPackedGrammars(final List

* feature_function = FEATURE OPTIONS * - * + * * Weights for features are listed separately. - * - * @param tmOwnersSeen + * * @throws IOException - * + * */ private void initializeFeatureFunctions() throws IOException { @@ -942,14 +933,13 @@ private void initializeFeatureFunctions() throws IOException { this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration)); } catch (Exception e) { e.printStackTrace(); - System.err.println("* FATAL: could not find a feature '" + featureName + "'"); - System.exit(1); + throw new RuntimeException("* FATAL: could not find a feature '" + featureName + "'"); } } for (FeatureFunction feature : featureFunctions) { - Decoder.LOG(1, String.format("FEATURE: %s", feature.logString())); - + LOG.info("FEATURE: {}", feature.logString()); + } weights.registerDenseFeatures(featureFunctions); @@ -958,13 +948,14 @@ private void initializeFeatureFunctions() throws IOException { /** * Searches a list of predefined paths for classes, and returns the first one found. Meant for * instantiating feature functions. - * + * * @param name * @return the class, found in one of the search paths * @throws ClassNotFoundException */ private Class getClass(String featureName) { Class clas = null; + String[] packages = { "joshua.decoder.ff", "joshua.decoder.ff.lm", "joshua.decoder.ff.phrase" }; for (String path : packages) { try { @@ -981,13 +972,4 @@ private Class getClass(String featureName) { } return clas; } - - public static boolean VERBOSE(int i) { - return i <= VERBOSE; - } - - public static void LOG(int i, String msg) { - if (VERBOSE(i)) - System.err.println(msg); - } } diff --git a/src/joshua/decoder/DecoderThread.java b/src/main/java/org/apache/joshua/decoder/DecoderThread.java similarity index 74% rename from src/joshua/decoder/DecoderThread.java rename to src/main/java/org/apache/joshua/decoder/DecoderThread.java index 4e2a15cb..d6f5233a 100644 --- a/src/joshua/decoder/DecoderThread.java +++ b/src/main/java/org/apache/joshua/decoder/DecoderThread.java @@ -16,24 +16,25 @@ * specific language governing permissions and limitations * under the License. 
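Two small review catches in the Decoder changes above, with sketches of what was presumably intended. First, the packed-grammar catch block composes a msg string and then throws RuntimeException(e) without it, so the explanation is lost. Second, the getClass() search path still lists the pre-rename packages, which would stop feature functions from being found after the move to org.apache.joshua:

```java
// 1) Attach the composed message instead of dropping it:
throw new RuntimeException(msg, e);

// 2) Search paths updated for the new package layout:
String[] packages = { "org.apache.joshua.decoder.ff",
                      "org.apache.joshua.decoder.ff.lm",
                      "org.apache.joshua.decoder.ff.phrase" };
```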
*/ -package joshua.decoder; +package org.apache.joshua.decoder; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.logging.Logger; - -import joshua.decoder.chart_parser.Chart; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.SourceDependentFF; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.hypergraph.ForestWalker; -import joshua.decoder.hypergraph.GrammarBuilderWalkerFunction; -import joshua.decoder.hypergraph.HyperGraph; -import joshua.decoder.phrase.Stacks; -import joshua.decoder.segment_file.Sentence; -import joshua.corpus.Vocabulary; + +import org.apache.joshua.decoder.chart_parser.Chart; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.SourceDependentFF; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.hypergraph.ForestWalker; +import org.apache.joshua.decoder.hypergraph.GrammarBuilderWalkerFunction; +import org.apache.joshua.decoder.hypergraph.HyperGraph; +import org.apache.joshua.decoder.phrase.Stacks; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class handles decoding of individual Sentence objects (which can represent plain sentences @@ -44,11 +45,13 @@ * * The DecoderFactory class is responsible for launching the threads. * - * @author Matt Post - * @author Zhifei Li, + * @author Matt Post post@cs.jhu.edu + * @author Zhifei Li, zhifei.work@gmail.com */ public class DecoderThread extends Thread { + private static final Logger LOG = LoggerFactory.getLogger(DecoderThread.class); + private final JoshuaConfiguration joshuaConfiguration; /* * these variables may be the same across all threads (e.g., just copy from DecoderFactory), or @@ -57,7 +60,6 @@ public class DecoderThread extends Thread { private final List allGrammars; private final List featureFunctions; - private static final Logger logger = Logger.getLogger(DecoderThread.class.getName()); // =============================================================== // Constructor @@ -91,21 +93,22 @@ public void run() { * Translate a sentence. * * @param sentence The sentence to be translated. 
+ * @return the sentence {@link org.apache.joshua.decoder.Translation} */ public Translation translate(Sentence sentence) { - Decoder.LOG(1, String.format("Input %d: %s", sentence.id(), sentence.fullSource())); + LOG.info("Input {}: {}", sentence.id(), sentence.fullSource()); if (sentence.target() != null) - Decoder.LOG(1, String.format("Input %d: Constraining to target sentence '%s'", - sentence.id(), sentence.target())); + LOG.info("Input {}: Constraining to target sentence '{}'", + sentence.id(), sentence.target()); // skip blank sentences if (sentence.isEmpty()) { - Decoder.LOG(1, String.format("Translation %d: Translation took 0 seconds", sentence.id())); + LOG.info("Translation {}: Translation took 0 seconds", sentence.id()); return new Translation(sentence, null, featureFunctions, joshuaConfiguration); } - + long startTime = System.currentTimeMillis(); int numGrammars = allGrammars.size(); @@ -113,7 +116,7 @@ public Translation translate(Sentence sentence) { for (int i = 0; i < allGrammars.size(); i++) grammars[i] = allGrammars.get(i); - + if (joshuaConfiguration.segment_oovs) sentence.segmentOOVs(grammars); @@ -127,7 +130,7 @@ public Translation translate(Sentence sentence) { if (joshuaConfiguration.search_algorithm.equals("stack")) { Stacks stacks = new Stacks(sentence, this.featureFunctions, grammars, joshuaConfiguration); - + hypergraph = stacks.search(); } else { /* Seeding: the chart only sees the grammars, not the factories */ @@ -135,19 +138,19 @@ public Translation translate(Sentence sentence) { joshuaConfiguration.goal_symbol, joshuaConfiguration); hypergraph = (joshuaConfiguration.use_dot_chart) - ? chart.expand() - : chart.expandSansDotChart(); + ? chart.expand() + : chart.expandSansDotChart(); } - + } catch (java.lang.OutOfMemoryError e) { - Decoder.LOG(1, String.format("Input %d: out of memory", sentence.id())); + LOG.error("Input {}: out of memory", sentence.id()); hypergraph = null; } float seconds = (System.currentTimeMillis() - startTime) / 1000.0f; - Decoder.LOG(1, String.format("Input %d: Translation took %.3f seconds", sentence.id(), seconds)); - Decoder.LOG(1, String.format("Input %d: Memory used is %.1f MB", sentence.id(), (Runtime - .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0)); + LOG.info("Input {}: Translation took {} seconds", sentence.id(), seconds); + LOG.info("Input {}: Memory used is {} MB", sentence.id(), (Runtime + .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0); /* Return the translation unless we're doing synchronous parsing. */ if (!joshuaConfiguration.parse || hypergraph == null) { @@ -155,7 +158,7 @@ public Translation translate(Sentence sentence) { } /*****************************************************************************************/ - + /* * Synchronous parsing. * @@ -164,8 +167,8 @@ public Translation translate(Sentence sentence) { Grammar newGrammar = getGrammarFromHyperGraph(joshuaConfiguration.goal_symbol, hypergraph); newGrammar.sortGrammar(this.featureFunctions); long sortTime = System.currentTimeMillis(); - logger.info(String.format("Sentence %d: New grammar has %d rules.", sentence.id(), - newGrammar.getNumRules())); + LOG.info("Sentence {}: New grammar has {} rules.", sentence.id(), + newGrammar.getNumRules()); /* Step 2. Create a new chart and parse with the instantiated grammar. 
*/ Grammar[] newGrammarArray = new Grammar[] { newGrammar }; @@ -173,20 +176,19 @@ public Translation translate(Sentence sentence) { Chart chart = new Chart(targetSentence, featureFunctions, newGrammarArray, "GOAL",joshuaConfiguration); int goalSymbol = GrammarBuilderWalkerFunction.goalSymbol(hypergraph); String goalSymbolString = Vocabulary.word(goalSymbol); - logger.info(String.format("Sentence %d: goal symbol is %s (%d).", sentence.id(), - goalSymbolString, goalSymbol)); + LOG.info("Sentence {}: goal symbol is {} ({}).", sentence.id(), + goalSymbolString, goalSymbol); chart.setGoalSymbolID(goalSymbol); /* Parsing */ HyperGraph englishParse = chart.expand(); long secondParseTime = System.currentTimeMillis(); - logger.info(String.format("Sentence %d: Finished second chart expansion (%d seconds).", - sentence.id(), (secondParseTime - sortTime) / 1000)); - logger.info(String.format("Sentence %d total time: %d seconds.\n", sentence.id(), - (secondParseTime - startTime) / 1000)); - logger.info(String.format("Memory used after sentence %d is %.1f MB", sentence.id(), (Runtime - .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0)); - + LOG.info("Sentence {}: Finished second chart expansion ({} seconds).", + sentence.id(), (secondParseTime - sortTime) / 1000); + LOG.info("Sentence {} total time: {} seconds.\n", sentence.id(), + (secondParseTime - startTime) / 1000); + LOG.info("Memory used after sentence {} is {} MB", sentence.id(), (Runtime + .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0); return new Translation(sentence, englishParse, featureFunctions, joshuaConfiguration); // or do something else } diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java similarity index 85% rename from src/joshua/decoder/JoshuaConfiguration.java rename to src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java index 7a3de23e..2bf8e375 100644 --- a/src/joshua/decoder/JoshuaConfiguration.java +++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; -import static joshua.util.FormatUtils.cleanNonTerminal; -import static joshua.util.FormatUtils.markup; +import static org.apache.joshua.util.FormatUtils.cleanNonTerminal; +import static org.apache.joshua.util.FormatUtils.markup; import java.io.File; import java.io.FileWriter; @@ -29,33 +29,36 @@ import java.io.FileReader; import java.util.ArrayList; import java.util.Collections; -import java.util.logging.Logger; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.fragmentlm.Tree; -import joshua.util.FormatUtils; -import joshua.util.Regex; -import joshua.util.io.LineReader; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.fragmentlm.Tree; +import org.apache.joshua.util.FormatUtils; +import org.apache.joshua.util.Regex; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Configuration file for Joshua decoder. - * + * * When adding new features to Joshua, any new configurable parameters should be added to this * class. 
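To make the class comment's request concrete: adding a configurable parameter here amounts to a public field with its default plus a branch in readConfigFile() keyed through normalize_key(). The parameter below is invented purely for illustration:

```java
// Hypothetical parameter: a public field with a sensible default ...
public int beam_size = 100;

// ... and a matching branch in readConfigFile(); normalize_key() makes
// "beam-size", "beam_size", and "beamSize" all hit this case.
} else if (parameter.equals(normalize_key("beam-size"))) {
  beam_size = Integer.parseInt(fds[1]);
  LOG.debug("beam-size: {}", beam_size);
}
```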
- * - * @author Zhifei Li, - * @author Matt Post + * + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class JoshuaConfiguration { - - // whether to construct a StructuredTranslation object for each request instead of + + private static final Logger LOG = LoggerFactory.getLogger(JoshuaConfiguration.class); + + // whether to construct a StructuredTranslation object for each request instead of // printing to stdout. Used when the Decoder is used from Java directly. public Boolean use_structured_output = false; - + // If set to true, Joshua will lowercase the input, creating an annotation that marks the // original case public boolean lowercase = false; - + // If set to true, Joshua will recapitalize the output by projecting the case from aligned // source-side words public boolean project_case = false; @@ -72,63 +75,63 @@ public class JoshuaConfiguration { * also just be listed in the main config file. */ public String weights_file = ""; - // Default symbols. The symbol here should be enclosed in square brackets. public String default_non_terminal = FormatUtils.markup("X"); + public String goal_symbol = FormatUtils.markup("GOAL"); /* * A list of OOV symbols in the form - * + * * [X1] weight [X2] weight [X3] weight ... - * + * * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the * input sentence, Joshua will create rules of the form - * + * * X1 -> w (weight) - * + * * If this is empty, an unweighted default_non_terminal is used. */ - public class OOVItem implements Comparable { public String label; + public float weight; OOVItem(String l, float w) { label = l; weight = w; } - @Override public int compareTo(OOVItem other) { - if (weight > other.weight) + if (weight > other.weight) return -1; else if (weight < other.weight) return 1; return 0; } } + public ArrayList oovList = null; /* * Whether to segment OOVs into a lattice */ public boolean segment_oovs = false; - + /* * Enable lattice decoding. */ public boolean lattice_decoding = false; - + /* * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not * sorted till they are first accessed. Amortized sorting means you get your first translation * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower. */ public boolean amortized_sorting = true; - // syntax-constrained decoding public boolean constrain_parse = false; + public boolean use_pos_labels = false; // oov-specific @@ -154,21 +157,21 @@ else if (weight < other.weight) /* The number of hypotheses to output by default. */ public int topN = 1; - + /** * This string describes the format of each line of output from the decoder (i.e., the * translations). The string can include arbitrary text and also variables. The following * variables are available: - * + * *

-   * - %i the 0-indexed sentence number 
-   * - %e the source string %s the translated sentence 
-   * - %S the translated sentence with some basic capitalization and denormalization 
-   * - %t the synchronous derivation 
-   * - %f the list of feature values (as name=value pairs) 
+   * - %i the 0-indexed sentence number
+   * - %e the source string; %s the translated sentence
+   * - %S the translated sentence with some basic capitalization and denormalization
+   * - %t the synchronous derivation
+   * - %f the list of feature values (as name=value pairs)
    * - %c the model cost
-   * - %w the weight vector 
-   * - %a the alignments between source and target words (currently unimplemented) 
+   * - %w the weight vector
+   * - %a the alignments between source and target words (currently unimplemented)
    * - %d a verbose, many-line version of the derivation
    * 
*/ @@ -189,7 +192,6 @@ else if (weight < other.weight) /* Enables synchronous parsing. */ public boolean parse = false; // perform synchronous parsing - private final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName()); /* A list of the feature functions. */ public ArrayList features = new ArrayList(); @@ -204,7 +206,7 @@ public enum INPUT_TYPE { plain, json }; /* Type of server. Not sure we need to keep the regular TCP one around. */ public enum SERVER_TYPE { none, TCP, HTTP }; public SERVER_TYPE server_type = SERVER_TYPE.TCP; - + /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */ public int server_port = 0; @@ -240,10 +242,10 @@ public enum SERVER_TYPE { none, TCP, HTTP }; /* The search algorithm: currently either "cky" or "stack" */ public String search_algorithm = "cky"; - + /* The distortion limit */ public int reordering_limit = 8; - + /* The number of target sides considered for each source side (after sorting by model weight) */ public int num_translation_options = 20; @@ -251,16 +253,16 @@ public enum SERVER_TYPE { none, TCP, HTTP }; * version of Sennrich (SSST 2014) */ public boolean use_dot_chart = true; - + /* Moses compatibility */ public boolean moses = false; - + /* If true, just print out the weights found in the config file, and exit. */ public boolean show_weights_and_quit = false; - + /* Read input from a file (Moses compatible flag) */ public String input_file = null; - + /* Write n-best output to this file */ public String n_best_file = null; @@ -269,27 +271,27 @@ public enum SERVER_TYPE { none, TCP, HTTP }; /* Weights overridden from the command line */ public String weight_overwrite = ""; - + /** * This method resets the state of JoshuaConfiguration back to the state after initialization. * This is useful when for example making different calls to the decoder within the same java * program, which otherwise leads to potential errors due to inconsistent state as a result of * loading the configuration multiple times without resetting etc. - * + * * This leads to the insight that in fact it may be an even better idea to refactor the code and * make JoshuaConfiguration an object that is is created and passed as an argument, rather than a * shared static object. This is just a suggestion for the next step. - * + * */ public void reset() { - logger.info("Resetting the JoshuaConfiguration to its defaults ..."); - logger.info("\n\tResetting the StatefullFF global state index ..."); - logger.info("\n\t...done"); + LOG.info("Resetting the JoshuaConfiguration to its defaults ..."); + LOG.info("\n\tResetting the StatefullFF global state index ..."); + LOG.info("\n\t...done"); StatefulFF.resetGlobalStateIndex(); tms = new ArrayList(); weights_file = ""; default_non_terminal = "[X]"; - oovList = new ArrayList(); + oovList = new ArrayList(); oovList.add(new OOVItem(default_non_terminal, 1.0f)); goal_symbol = "[GOAL]"; amortized_sorting = true; @@ -311,10 +313,10 @@ public void reset() { features = new ArrayList(); weights = new ArrayList(); server_port = 0; - + reordering_limit = 8; num_translation_options = 20; - logger.info("...done"); + LOG.info("...done"); } // =============================================================== @@ -325,6 +327,8 @@ public void reset() { * To process command-line options, we write them to a file that looks like the config file, and * then call readConfigFile() on it. 
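A sketch of the temp-file round trip this comment describes: each command-line pair is rewritten as a config-style line and fed back through readConfigFile(). The flag-stripping rewrite is simplified for illustration; the real method also copes with flag-only options.

```java
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;

void processCommandLineOptionsSketch(String[] options) throws IOException {
  File tmp = File.createTempFile("joshua-opts", ".cfg");
  try (PrintWriter out = new PrintWriter(tmp)) {
    for (int i = 0; i + 1 < options.length; i += 2)   // "-flag value" pairs
      out.printf("%s = %s%n", options[i].replaceFirst("^-+", ""), options[i + 1]);
  }
  readConfigFile(tmp.getPath());  // reuse the config-file parser
  tmp.delete();
}
```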
It would be more general to define a class that sits on a * stream and knows how to chop it up, but this was quicker to implement. + * + * @param options string array of command line options */ public void processCommandLineOptions(String[] options) { try { @@ -353,8 +357,7 @@ public void processCommandLineOptions(String[] options) { tmpFile.delete(); } catch (IOException e) { - e.printStackTrace(); - System.exit(1); + throw new RuntimeException(e); } } @@ -364,7 +367,7 @@ public void readConfigFile(String configFile) throws IOException { try { for (String line : configReader) { line = line.trim(); // .toLowerCase(); - + if (Regex.commentOrEmptyLine.matches(line)) continue; @@ -377,7 +380,7 @@ public void readConfigFile(String configFile) throws IOException { if (line.indexOf("=") != -1) { // parameters; (not feature function) String[] fds = Regex.equalsWithSpaces.split(line, 2); if (fds.length < 2) { - Decoder.LOG(1, String.format("* WARNING: skipping config file line '%s'", line)); + LOG.warn("skipping config file line '{}'", line); continue; } @@ -398,7 +401,7 @@ public void readConfigFile(String configFile) throws IOException { * * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz */ - + String[] tokens = fds[1].split("\\s+"); if (tokens[2].equals("true")) features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s", @@ -414,25 +417,24 @@ public void readConfigFile(String configFile) throws IOException { * tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH */ String tmLine = fds[1]; - + String[] tokens = fds[1].split("\\s+"); if (! tokens[1].startsWith("-")) { // old format tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]); - Decoder.LOG(1, String.format("WARNING: Converting deprecated TM line from '%s' -> '%s'", fds[1], tmLine)); + LOG.warn("Converting deprecated TM line from '{}' -> '{}'", fds[1], tmLine); } tms.add(tmLine); - + } else if (parameter.equals("v")) { Decoder.VERBOSE = Integer.parseInt(fds[1]); } else if (parameter.equals(normalize_key("parse"))) { parse = Boolean.parseBoolean(fds[1]); - logger.finest(String.format("parse: %s", parse)); + LOG.debug("parse: {}", parse); } else if (parameter.equals(normalize_key("dump-hypergraph"))) { hypergraphFilePattern = fds[1].trim(); - logger - .finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern)); + LOG.debug(" hypergraph dump file format: {}", hypergraphFilePattern); } else if (parameter.equals(normalize_key("oov-list"))) { if (new File(fds[1]).exists()) { @@ -446,7 +448,7 @@ public void readConfigFile(String configFile) throws IOException { String[] tokens = str.trim().split("\\s+"); oovList.add(new OOVItem(FormatUtils.markup(tokens[0]), - (float) Math.log(Float.parseFloat(tokens[1])))); + (float) Math.log(Float.parseFloat(tokens[1])))); str = br.readLine(); } @@ -462,10 +464,8 @@ public void readConfigFile(String configFile) throws IOException { } else { String[] tokens = fds[1].trim().split("\\s+"); if (tokens.length % 2 != 0) { - System.err.println(String.format("* FATAL: invalid format for '%s'", fds[0])); - System.exit(1); - } - + throw new RuntimeException(String.format("* FATAL: invalid format for '%s'", fds[0])); + } oovList = new ArrayList(); for (int i = 0; i < tokens.length; i += 2) @@ -477,18 +477,18 @@ public void readConfigFile(String configFile) throws IOException { } else if (parameter.equals(normalize_key("lattice-decoding"))) { lattice_decoding = true; 
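For clarity, the deprecated-TM-line upgrade logged above turns the old positional form "TYPE OWNER MAXSPAN PATH" into the keyword form, using the same format string as the patch. The input line here is a hypothetical example:

```java
String oldLine = "thrax pt 12 grammar.gz";   // hypothetical old-style line
String[] tokens = oldLine.split("\\s+");
String tmLine = oldLine;
if (!tokens[1].startsWith("-")) {            // old positional format detected
  tmLine = String.format("%s -owner %s -maxspan %s -path %s",
      tokens[0], tokens[1], tokens[2], tokens[3]);
}
// tmLine -> "thrax -owner pt -maxspan 12 -path grammar.gz"
```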
- + } else if (parameter.equals(normalize_key("segment-oovs"))) { segment_oovs = true; lattice_decoding = true; } else if (parameter.equals(normalize_key("default-non-terminal"))) { default_non_terminal = markup(cleanNonTerminal(fds[1].trim())); - logger.finest(String.format("default_non_terminal: %s", default_non_terminal)); + LOG.debug("default_non_terminal: {}", default_non_terminal); } else if (parameter.equals(normalize_key("goal-symbol"))) { goal_symbol = markup(cleanNonTerminal(fds[1].trim())); - logger.finest("goalSymbol: " + goal_symbol); + LOG.debug("goalSymbol: {}", goal_symbol); } else if (parameter.equals(normalize_key("weights-file"))) { weights_file = fds[1]; @@ -510,19 +510,19 @@ public void readConfigFile(String configFile) throws IOException { } else if (parameter.equals(normalize_key("use_unique_nbest"))) { use_unique_nbest = Boolean.valueOf(fds[1]); - logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest)); + LOG.debug("use_unique_nbest: {}", use_unique_nbest); } else if (parameter.equals(normalize_key("output-format"))) { outputFormat = fds[1]; - logger.finest(String.format("output-format: %s", outputFormat)); + LOG.debug("output-format: {}", outputFormat); } else if (parameter.equals(normalize_key("include_align_index"))) { include_align_index = Boolean.valueOf(fds[1]); - logger.finest(String.format("include_align_index: %s", include_align_index)); + LOG.debug("include_align_index: {}", include_align_index); } else if (parameter.equals(normalize_key("top_n"))) { topN = Integer.parseInt(fds[1]); - logger.finest(String.format("topN: %s", topN)); + LOG.debug("topN: {}", topN); } else if (parameter.equals(normalize_key("num_parallel_decoders")) || parameter.equals(normalize_key("threads"))) { @@ -531,26 +531,25 @@ public void readConfigFile(String configFile) throws IOException { throw new IllegalArgumentException( "Must specify a positive number for num_parallel_decoders"); } - logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders)); + LOG.debug("num_parallel_decoders: {}", num_parallel_decoders); } else if (parameter.equals(normalize_key("mark_oovs"))) { mark_oovs = Boolean.valueOf(fds[1]); - logger.finest(String.format("mark_oovs: %s", mark_oovs)); + LOG.debug("mark_oovs: {}", mark_oovs); } else if (parameter.equals(normalize_key("pop-limit"))) { pop_limit = Integer.parseInt(fds[1]); - logger.finest(String.format("pop-limit: %s", pop_limit)); + LOG.info("pop-limit: {}", pop_limit); } else if (parameter.equals(normalize_key("input-type"))) { - if (fds[1].equals("json")) + if (fds[1].equals("json")) { input_type = INPUT_TYPE.json; - else if (fds[1].equals("plain")) + } else if (fds[1].equals("plain")) { input_type = INPUT_TYPE.plain; - else { - System.err.println(String.format("* FATAL: invalid server type '%s'", fds[1])); - System.exit(1); + } else { + throw new RuntimeException(String.format("* FATAL: invalid server type '%s'", fds[1])); } - logger.info(String.format(" input-type: %s", input_type)); + LOG.info(" input-type: {}", input_type); } else if (parameter.equals(normalize_key("server-type"))) { if (fds[1].toLowerCase().equals("tcp")) @@ -558,19 +557,19 @@ else if (fds[1].equals("plain")) else if (fds[1].toLowerCase().equals("http")) server_type = SERVER_TYPE.HTTP; - logger.info(String.format(" server-type: %s", server_type)); - + LOG.info(" server-type: {}", server_type); + } else if (parameter.equals(normalize_key("server-port"))) { server_port = Integer.parseInt(fds[1]); - logger.info(String.format(" server-port: %d", 
server_port)); + LOG.info(" server-port: {}", server_port); } else if (parameter.equals(normalize_key("rescore-forest"))) { rescoreForest = true; - logger.info(String.format(" rescore-forest: %s", rescoreForest)); + LOG.info(" rescore-forest: {}", rescoreForest); } else if (parameter.equals(normalize_key("rescore-forest-weight"))) { rescoreForestWeight = Float.parseFloat(fds[1]); - logger.info(String.format(" rescore-forest-weight: %f", rescoreForestWeight)); + LOG.info(" rescore-forest-weight: {}", rescoreForestWeight); } else if (parameter.equals(normalize_key("maxlen"))) { // reset the maximum length @@ -591,22 +590,22 @@ else if (fds[1].toLowerCase().equals("http")) } else if (parameter .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) { fuzzy_matching = Boolean.parseBoolean(fds[1]); - logger.finest(String.format(fuzzy_matching + ": %s", fuzzy_matching)); + LOG.debug("fuzzy_matching: {}", fuzzy_matching); } else if (parameter.equals(normalize_key("fragment-map"))) { fragmentMapFile = fds[1]; Tree.readMapping(fragmentMapFile); - /** PHRASE-BASED PARAMETERS **/ + /** PHRASE-BASED PARAMETERS **/ } else if (parameter.equals(normalize_key("search"))) { search_algorithm = fds[1]; - + if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) { throw new RuntimeException( "-search must be one of 'stack' (for phrase-based decoding) " + - "or 'cky' (for hierarchical / syntactic decoding)"); + "or 'cky' (for hierarchical / syntactic decoding)"); } - + if (search_algorithm.equals("cky") && include_align_index) { throw new RuntimeException( "include_align_index is currently not supported with cky search"); @@ -617,13 +616,13 @@ else if (fds[1].toLowerCase().equals("http")) } else if (parameter.equals(normalize_key("num-translation-options"))) { num_translation_options = Integer.parseInt(fds[1]); - + } else if (parameter.equals(normalize_key("no-dot-chart"))) { use_dot_chart = false; - + } else if (parameter.equals(normalize_key("moses"))) { moses = true; // triggers some Moses-specific compatibility options - + } else if (parameter.equals(normalize_key("show-weights"))) { show_weights_and_quit = true; @@ -637,23 +636,23 @@ else if (fds[1].toLowerCase().equals("http")) } else if (parameter.equals(normalize_key("input-file"))) { // for Moses compatibility input_file = fds[1]; - + } else if (parameter.equals(normalize_key("weight-file"))) { // for Moses, ignore } else if (parameter.equals(normalize_key("weight-overwrite"))) { weight_overwrite = fds[1]; - + } else if (parameter.equals(normalize_key("source-annotations"))) { // Check source sentence source_annotations = true; } else if (parameter.equals(normalize_key("cached-rules-size"))) { - // Check source sentence - cachedRuleSize = Integer.parseInt(fds[1]); + // Check source sentence + cachedRuleSize = Integer.parseInt(fds[1]); } else if (parameter.equals(normalize_key("lowercase"))) { lowercase = true; - + } else if (parameter.equals(normalize_key("project-case"))) { project_case = true; @@ -666,15 +665,14 @@ else if (fds[1].toLowerCase().equals("http")) || parameter.equals(normalize_key("useCubePrune")) || parameter.equals(normalize_key("useBeamAndThresholdPrune")) || parameter.equals(normalize_key("regexp-grammar"))) { - logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0])); + LOG.warn("ignoring deprecated parameter '{}'", fds[0]); } else { - logger.warning("FATAL: unknown configuration parameter '" + fds[0] + "'"); - System.exit(1); + throw new RuntimeException("FATAL: 
unknown configuration parameter '" + fds[0] + "'"); } } - Decoder.LOG(1, String.format(" %s = '%s'", normalize_key(fds[0]), fds[1])); + LOG.info(" {} = '{}'", normalize_key(fds[0]), fds[1]); } else { /* @@ -701,8 +699,13 @@ public void sanityCheck() { * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and * camelCasing in paramter names without forcing the user to memorize them all. Here are some * examples of equivalent ways to refer to parameter names: - * + *
    * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
+   * </pre>
+ * + * @param text the string to be normalized + * @return normalized key + * */ public static String normalize_key(String text) { return text.replaceAll("[-_]", "").toLowerCase(); diff --git a/src/joshua/decoder/JoshuaDecoder.java b/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java similarity index 68% rename from src/joshua/decoder/JoshuaDecoder.java rename to src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java index 841f517d..951f5c7e 100644 --- a/src/joshua/decoder/JoshuaDecoder.java +++ b/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.io.BufferedReader; import java.io.FileInputStream; @@ -26,27 +26,28 @@ import java.io.InputStreamReader; import java.io.PrintStream; import java.net.InetSocketAddress; -import java.util.logging.Logger; import com.sun.net.httpserver.HttpServer; -import joshua.decoder.JoshuaConfiguration.SERVER_TYPE; -import joshua.decoder.io.TranslationRequestStream; -import joshua.server.TcpServer; -import joshua.server.ServerThread; +import org.apache.joshua.decoder.JoshuaConfiguration.SERVER_TYPE; +import org.apache.joshua.decoder.io.TranslationRequestStream; +import org.apache.joshua.server.TcpServer; +import org.apache.joshua.server.ServerThread; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Implements decoder initialization, including interaction with JoshuaConfiguration * and DecoderThread. * - * @author Zhifei Li, - * @author wren ng thornton - * @author Lane Schwartz + * @author Zhifei Li, zhifei.work@gmail.com + * @author wren ng thornton wren@users.sourceforge.net + * @author Lane Schwartz dowobeha@users.sourceforge.net */ public class JoshuaDecoder { - private static final Logger logger = Logger.getLogger(JoshuaDecoder.class.getName()); - + private static final Logger LOG = LoggerFactory.getLogger(JoshuaDecoder.class); + // =============================================================== // Main // =============================================================== @@ -55,12 +56,6 @@ public static void main(String[] args) throws IOException { JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration(); ArgsParser userArgs = new ArgsParser(args,joshuaConfiguration); - String logFile = System.getenv().get("JOSHUA") + "/logging.properties"; - try { - java.util.logging.LogManager.getLogManager().readConfiguration(new FileInputStream(logFile)); - } catch (IOException e) { - logger.warning("Couldn't initialize logging properties from '" + logFile + "'"); - } long startTime = System.currentTimeMillis(); @@ -70,10 +65,9 @@ public static void main(String[] args) throws IOException { /* Step-1: initialize the decoder, test-set independent */ Decoder decoder = new Decoder(joshuaConfiguration, userArgs.getConfigFile()); - Decoder.LOG(1, String.format("Model loading took %d seconds", - (System.currentTimeMillis() - startTime) / 1000)); - Decoder.LOG(1, String.format("Memory used %.1f MB", ((Runtime.getRuntime().totalMemory() - Runtime - .getRuntime().freeMemory()) / 1000000.0))); + LOG.info("Model loading took {} seconds", (System.currentTimeMillis() - startTime) / 1000); + LOG.info("Memory used {} MB", ((Runtime.getRuntime().totalMemory() + - Runtime.getRuntime().freeMemory()) / 1000000.0)); /* Step-2: Decoding */ // create a server if requested, which will create TranslationRequest objects @@ -84,12 +78,12 @@ public static void main(String[] args) 
throws IOException { } else if (joshuaConfiguration.server_type == SERVER_TYPE.HTTP) { HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); - Decoder.LOG(1, String.format("** HTTP Server running and listening on port %d.", port)); + LOG.info("HTTP Server running and listening on port {}.", port); server.createContext("/", new ServerThread(null, decoder, joshuaConfiguration)); server.setExecutor(null); // creates a default executor server.start(); } else { - System.err.println("* FATAL: unknown server type"); + LOG.error("Unknown server type"); System.exit(1); } return; @@ -112,13 +106,12 @@ public static void main(String[] args) throws IOException { if (joshuaConfiguration.n_best_file != null) out.close(); - Decoder.LOG(1, "Decoding completed."); - Decoder.LOG(1, String.format("Memory used %.1f MB", ((Runtime.getRuntime().totalMemory() - Runtime - .getRuntime().freeMemory()) / 1000000.0))); + LOG.info("Decoding completed."); + LOG.info("Memory used {} MB", ((Runtime.getRuntime().totalMemory() + - Runtime.getRuntime().freeMemory()) / 1000000.0)); /* Step-3: clean up */ decoder.cleanUp(); - Decoder.LOG(1, String.format("Total running time: %d seconds", - (System.currentTimeMillis() - startTime) / 1000)); + LOG.info("Total running time: {} seconds", (System.currentTimeMillis() - startTime) / 1000); } } diff --git a/src/joshua/decoder/MetaDataException.java b/src/main/java/org/apache/joshua/decoder/MetaDataException.java similarity index 97% rename from src/joshua/decoder/MetaDataException.java rename to src/main/java/org/apache/joshua/decoder/MetaDataException.java index 932059ce..394891a7 100644 --- a/src/joshua/decoder/MetaDataException.java +++ b/src/main/java/org/apache/joshua/decoder/MetaDataException.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; /* * This class is used to capture metadata commands to Joshua on input and pass them to the diff --git a/src/joshua/decoder/NbestMinRiskReranker.java b/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java similarity index 93% rename from src/joshua/decoder/NbestMinRiskReranker.java rename to src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java index 9596ae00..9f63cad6 100644 --- a/src/joshua/decoder/NbestMinRiskReranker.java +++ b/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.io.IOException; import java.util.ArrayList; @@ -29,9 +29,10 @@ import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.TimeUnit; -import joshua.util.Ngram; -import joshua.util.Regex; - +import org.apache.joshua.util.Ngram; +import org.apache.joshua.util.Regex; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * this class implements: (1) nbest min risk (MBR) reranking using BLEU as a gain function. 
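Reviewer note: the class javadoc above compresses the MBR step into one line, so here is a minimal, self-contained Java sketch of the probability normalization that computeNormalizedProbs (changed further down in this file) is documented to perform. The standalone class and the max-subtraction trick for numerical stability are assumptions of this sketch, not necessarily what the patched code does.

import java.util.Arrays;

public class MbrNormalizationSketch {
  // Scale each log-prob, exponentiate relative to the max (avoids underflow),
  // and renormalize so the values form a distribution over the n-best list.
  static double[] normalize(double[] logProbs, double scalingFactor) {
    double max = Double.NEGATIVE_INFINITY;
    for (double lp : logProbs)
      max = Math.max(max, lp * scalingFactor);
    double[] probs = new double[logProbs.length];
    double sum = 0.0;
    for (int i = 0; i < logProbs.length; i++) {
      probs[i] = Math.exp(logProbs[i] * scalingFactor - max);
      sum += probs[i];
    }
    for (int i = 0; i < probs.length; i++)
      probs[i] /= sum;
    return probs;
  }

  public static void main(String[] args) {
    // Hypothetical 3-best log-probs; scalingFactor 1.0 keeps the model's sharpness.
    System.out.println(Arrays.toString(normalize(new double[] { -2.0, -2.5, -4.0 }, 1.0)));
  }
}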
@@ -42,10 +43,12 @@ * uses a Viterbi approximation: the probability of a string is its best derivation probability. So, * if one wants to deal with spurious ambiguity, he/she should do that before calling this class * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class NbestMinRiskReranker { + private static final Logger LOG = LoggerFactory.getLogger(NbestMinRiskReranker.class); + // TODO: this functionality is not implemented yet; default is to produce 1best without any // feature scores; boolean produceRerankedNbest = false; @@ -67,7 +70,7 @@ public NbestMinRiskReranker(boolean produceRerankedNbest, double scalingFactor) public String processOneSent(List nbest, int sentID) { - System.err.println("Now process sentence " + sentID); + LOG.info("Now process sentence {}", sentID); // step-0: preprocess // assumption: each hyp has a format: @@ -77,7 +80,7 @@ public String processOneSent(List nbest, int sentID) { if (nbest.size() == 1) { String[] fields = Regex.threeBarsWithSpace.split(nbest.get(0)); if (fields[1].equals("") || Regex.spaces.matches(fields[1])) { - System.err.println(String.format("-> sentence is empty")); + LOG.warn("-> sentence is empty"); return ""; } } @@ -171,7 +174,7 @@ public String processOneSent(List nbest, int sentID) { */ } - System.err.println("best gain: " + bestGain); + LOG.info("best gain: {}", bestGain); if (null == bestHyp) { throw new RuntimeException("mbr reranked one best is null, must be wrong"); } @@ -182,7 +185,10 @@ public String processOneSent(List nbest, int sentID) { /** * based on a list of log-probabilities in nbestLogProbs, obtain a normalized distribution, and * put the normalized probability (real value in [0,1]) into nbestLogProbs - * */ + * + * @param nbestLogProbs a {@link java.util.List} of {@link java.lang.Double} representing nbestLogProbs + * @param scalingFactor double value representing scaling factor + */ // get a normalized distribution and put it back to nbestLogProbs static public void computeNormalizedProbs(List nbestLogProbs, double scalingFactor) { @@ -311,8 +317,10 @@ public static void main(String[] args) throws IOException { // If you don't know what to use for scaling factor, try using 1 if (args.length < 2) { - System.err - .println("usage: java NbestMinRiskReranker [numThreads]"); + String msg = "usage: java NbestMinRiskReranker " + + "[numThreads]"; + System.err.println(msg); + LOG.error(msg); return; } long startTime = System.currentTimeMillis(); @@ -324,7 +332,7 @@ public static void main(String[] args) throws IOException { NbestMinRiskReranker mbrReranker = new NbestMinRiskReranker(produceRerankedNbest, scalingFactor); - System.err.println("##############running mbr reranking"); + LOG.info("Running mbr reranking"); int oldSentID = -1; List nbest = new ArrayList(); @@ -390,18 +398,15 @@ public static void main(String[] args) throws IOException { String best_hyp = result.toString(); System.out.println(best_hyp); } - - } catch (InterruptedException e) { - e.printStackTrace(); + LOG.error(e.getMessage(), e); } - } scanner.close(); - System.err.println("Total running time (seconds) is " - + (System.currentTimeMillis() - startTime) / 1000.0); + LOG.info("Total running time (seconds) is {}", + (System.currentTimeMillis() - startTime) / 1000.0); } private class RankerTask implements Runnable { diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java similarity index 80% rename from src/joshua/decoder/StructuredTranslation.java 
rename to src/main/java/org/apache/joshua/decoder/StructuredTranslation.java index 7b2185f9..8aa518ef 100644 --- a/src/joshua/decoder/StructuredTranslation.java +++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java @@ -16,27 +16,26 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import static java.util.Arrays.asList; import static java.util.Collections.emptyList; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList; -import static joshua.util.FormatUtils.removeSentenceMarkers; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList; +import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers; import java.util.List; import java.util.Map; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.hypergraph.HyperGraph; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.hypergraph.HyperGraph; +import org.apache.joshua.decoder.segment_file.Sentence; /** - * structuredTranslation provides a more structured access to translation + *
structuredTranslation provides a more structured access to translation * results than the Translation class. - * Members of instances of this class can be used upstream. - *
+ * Members of instances of this class can be used upstream.
* TODO: * Enable K-Best extraction. * @@ -107,6 +106,7 @@ public float getTranslationScore() { /** * Returns a list of target to source alignments. + * @return a list of target to source alignments */ public List> getTranslationWordAlignments() { return translationWordAlignments; @@ -118,6 +118,7 @@ public Map getTranslationFeatures() { /** * Time taken to build output information from the hypergraph. + * @return the time taken to build output information from the hypergraph */ public Float getExtractionTime() { return extractionTime; diff --git a/src/joshua/decoder/Support.java b/src/main/java/org/apache/joshua/decoder/Support.java similarity index 92% rename from src/joshua/decoder/Support.java rename to src/main/java/org/apache/joshua/decoder/Support.java index af33ec5c..e513aef8 100644 --- a/src/joshua/decoder/Support.java +++ b/src/main/java/org/apache/joshua/decoder/Support.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.util.List; /** - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class Support { @@ -33,14 +33,15 @@ public static double findMax(double a, double b) { return (a > b) ? a : b; } - public static int[] toArray(List in) { return subIntArray(in, 0, in.size()); } /** + * @param in a {@link java.util.List} of Integer * @param start inclusive * @param end exclusive + * @return sub int[] from start to end */ public static int[] subIntArray(List in, int start, int end) { int[] res = new int[end - start]; diff --git a/src/joshua/decoder/Translation.java b/src/main/java/org/apache/joshua/decoder/Translation.java similarity index 75% rename from src/joshua/decoder/Translation.java rename to src/main/java/org/apache/joshua/decoder/Translation.java index 8004d9f6..ab37814a 100644 --- a/src/joshua/decoder/Translation.java +++ b/src/main/java/org/apache/joshua/decoder/Translation.java @@ -16,35 +16,38 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder; +package org.apache.joshua.decoder; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString; -import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments; -import static joshua.util.FormatUtils.removeSentenceMarkers; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString; +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments; +import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers; import java.io.BufferedWriter; import java.io.IOException; import java.io.StringWriter; import java.util.List; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.lm.StateMinimizingLanguageModel; -import joshua.decoder.hypergraph.HyperGraph; -import joshua.decoder.hypergraph.KBestExtractor; -import joshua.decoder.io.DeNormalize; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.lm.StateMinimizingLanguageModel; +import org.apache.joshua.decoder.hypergraph.HyperGraph; +import org.apache.joshua.decoder.hypergraph.KBestExtractor; +import org.apache.joshua.decoder.io.DeNormalize; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class represents translated input objects (sentences or lattices). It is aware of the source * sentence and id and contains the decoded hypergraph. Translation objects are returned by * DecoderThread instances to the InputHandler, where they are assembled in order for output. 
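Reviewer note: the hunks below touch the %-placeholder expansion that this class performs on the configured output format; for orientation, a hedged sketch of that expansion with a hypothetical harness. The placeholder subset shown (%s source, %e target, %i id, %c score) is the one visible in this diff; the helper names are illustrative.

import java.util.Locale;

public class OutputFormatSketch {
  // Expands output-format placeholders the way Translation's constructor
  // does, via chained String.replace calls over the template.
  static String expand(String template, int id, String source, String target, float score) {
    return template
        .replace("%s", source)
        .replace("%e", target)
        .replace("%i", Integer.toString(id))
        .replace("%c", String.format(Locale.ROOT, "%.3f", score));
  }

  public static void main(String[] args) {
    System.out.println(expand("%i ||| %e ||| %c", 7, "guten tag", "good day", -12.345f));
    // prints: 7 ||| good day ||| -12.345
  }
}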
* - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class Translation { + private static final Logger LOG = LoggerFactory.getLogger(Translation.class); private Sentence source; /** @@ -54,17 +57,17 @@ public class Translation { private String output = null; private StructuredTranslation structuredTranslation = null; - + public Translation(Sentence source, HyperGraph hypergraph, List featureFunctions, JoshuaConfiguration joshuaConfiguration) { this.source = source; - + if (joshuaConfiguration.use_structured_output) { - + structuredTranslation = new StructuredTranslation( source, hypergraph, featureFunctions); this.output = structuredTranslation.getTranslationString(); - + } else { StringWriter sw = new StringWriter(); @@ -81,15 +84,14 @@ public Translation(Sentence source, HyperGraph hypergraph, // We must put this weight as zero, otherwise we get an error when we try to retrieve it // without checking Decoder.weights.increment("BLEU", 0); - + if (joshuaConfiguration.topN == 0) { - + /* construct Viterbi output */ final String best = getViterbiString(hypergraph); - - Decoder.LOG(1, String.format("Translation %d: %.3f %s", source.id(), hypergraph.goalNode.getScore(), - best)); - + + LOG.info("Translation {}: {} {}", source.id(), hypergraph.goalNode.getScore(), best); + /* * Setting topN to 0 turns off k-best extraction, in which case we need to parse through * the output-string, with the understanding that we can only substitute variables for the @@ -100,21 +102,21 @@ public Translation(Sentence source, HyperGraph hypergraph, .replace("%S", DeNormalize.processSingleLine(best)) .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore())) .replace("%i", String.format("%d", source.id())); - + if (joshuaConfiguration.outputFormat.contains("%a")) { translation = translation.replace("%a", getViterbiWordAlignments(hypergraph)); } - + if (joshuaConfiguration.outputFormat.contains("%f")) { final FeatureVector features = getViterbiFeatures(hypergraph, featureFunctions, source); translation = translation.replace("%f", joshuaConfiguration.moses ? 
features.mosesString() : features.toString()); } - + out.write(translation); out.newLine(); - + } else { - + final KBestExtractor kBestExtractor = new KBestExtractor( source, featureFunctions, Decoder.weights, false, joshuaConfiguration); kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out); @@ -129,35 +131,34 @@ public Translation(Sentence source, HyperGraph hypergraph, } float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f; - Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(), - joshuaConfiguration.topN, seconds)); - - } else { - - // Failed translations and blank lines get empty formatted outputs - // @formatter:off - String outputString = joshuaConfiguration.outputFormat - .replace("%s", source.source()) - .replace("%e", "") - .replace("%S", "") - .replace("%t", "()") - .replace("%i", Integer.toString(source.id())) - .replace("%f", "") - .replace("%c", "0.000"); - // @formatter:on - - out.write(outputString); - out.newLine(); - } + LOG.info("Input {}: {}-best extraction took {} seconds", id(), + joshuaConfiguration.topN, seconds); + + } else { + + // Failed translations and blank lines get empty formatted outputs + // @formatter:off + String outputString = joshuaConfiguration.outputFormat + .replace("%s", source.source()) + .replace("%e", "") + .replace("%S", "") + .replace("%t", "()") + .replace("%i", Integer.toString(source.id())) + .replace("%f", "") + .replace("%c", "0.000"); + // @formatter:on + + out.write(outputString); + out.newLine(); + } out.flush(); } catch (IOException e) { - e.printStackTrace(); - System.exit(1); + throw new RuntimeException(e); } - + this.output = sw.toString(); - + } /* @@ -170,7 +171,7 @@ public Translation(Sentence source, HyperGraph hypergraph, break; } } - + } public Sentence getSourceSentence() { @@ -185,12 +186,12 @@ public int id() { public String toString() { return output; } - + /** * Returns the StructuredTranslation object * if JoshuaConfiguration.construct_structured_output == True. * @throws RuntimeException if StructuredTranslation object not set. - * @return + * @return {@link org.apache.joshua.decoder.StructuredTranslation} object */ public StructuredTranslation getStructuredTranslation() { if (structuredTranslation == null) { @@ -198,5 +199,5 @@ public StructuredTranslation getStructuredTranslation() { } return structuredTranslation; } - + } diff --git a/src/joshua/decoder/Translations.java b/src/main/java/org/apache/joshua/decoder/Translations.java similarity index 94% rename from src/joshua/decoder/Translations.java rename to src/main/java/org/apache/joshua/decoder/Translations.java index e6ba9e64..0b91ff90 100644 --- a/src/joshua/decoder/Translations.java +++ b/src/main/java/org/apache/joshua/decoder/Translations.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder; +package org.apache.joshua.decoder; import java.util.LinkedList; -import joshua.decoder.io.TranslationRequestStream; +import org.apache.joshua.decoder.io.TranslationRequestStream; /** * This class represents a streaming sequence of translations. It is returned by the main entry @@ -30,7 +30,7 @@ * Translation in the right place. When the next translation in a sequence is available, next() is * notified. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class Translations { @@ -73,7 +73,7 @@ public void finish() { * the ID of the translation is the same as the one being waited for (currentID). 
If so, the * thread waiting for it is notified. * - * @param translation + * @param translation a translated input object */ public void record(Translation translation) { synchronized (this) { @@ -98,6 +98,8 @@ public void record(Translation translation) { /** * Returns the next Translation, blocking if necessary until it's available, since the next * Translation might not have been produced yet. + * + * @return first element from the list of {@link org.apache.joshua.decoder.Translation}'s */ public Translation next() { synchronized (this) { diff --git a/src/joshua/decoder/chart_parser/Cell.java b/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java similarity index 95% rename from src/joshua/decoder/chart_parser/Cell.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java index d8d16d81..10b9200f 100644 --- a/src/joshua/decoder/chart_parser/Cell.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import static com.google.common.base.Preconditions.checkNotNull; @@ -28,13 +28,14 @@ import java.util.Map; import java.util.Set; import java.util.Map.Entry; -import java.util.logging.Logger; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * this class implements functions: (1) combine small items into larger ones using rules, and create @@ -46,6 +47,12 @@ */ class Cell { + // =============================================================== + // Static fields + // =============================================================== + private static final Logger LOG = LoggerFactory.getLogger(Cell.class); + + + // The chart this cell belongs to private Chart chart = null; @@ -63,10 +70,6 @@ class Cell { */ private List sortedNodes = null; - // =============================================================== - // Static fields - // =============================================================== - private static final Logger logger = Logger.getLogger(Cell.class.getName()); // =============================================================== // Constructor @@ -131,7 +134,7 @@ boolean transitToGoal(Cell bin, List featureFunctions, int sent int itemsInGoalBin = getSortedNodes().size(); if (1 != itemsInGoalBin) { - logger.severe("the goal_bin does not have exactly one item"); + LOG.error("the goal_bin does not have exactly one item"); return false; } diff --git a/src/joshua/decoder/chart_parser/Chart.java b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java similarity index 91% rename from src/joshua/decoder/chart_parser/Chart.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java index b10c0134..184ae27e 100644 --- a/src/joshua/decoder/chart_parser/Chart.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java @@ -16,37 +16,35 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.PriorityQueue; -import java.util.logging.Level; -import java.util.logging.Logger; - -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.CubePruneState; -import joshua.decoder.chart_parser.DotChart.DotNode; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.SourceDependentFF; -import joshua.decoder.ff.tm.AbstractGrammar; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperGraph; -import joshua.decoder.segment_file.Sentence; -import joshua.decoder.segment_file.Token; -import joshua.lattice.Arc; -import joshua.lattice.Lattice; -import joshua.lattice.Node; -import joshua.util.ChartSpan; + +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.DotChart.DotNode; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.SourceDependentFF; +import org.apache.joshua.decoder.ff.tm.AbstractGrammar; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperGraph; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.lattice.Arc; +import org.apache.joshua.lattice.Lattice; +import org.apache.joshua.lattice.Node; +import org.apache.joshua.util.ChartSpan; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Chart class this class implements chart-parsing: (1) seeding the chart (2) @@ -60,12 +58,13 @@ * index of sentences: start from zero index of cell: cell (i,j) represent span * of words indexed [i,j-1] where i is in [0,n-1] and j is in [1,n] * - * @author Zhifei Li, - * @author Matt Post + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class Chart { + private static final Logger LOG = LoggerFactory.getLogger(Chart.class); private final JoshuaConfiguration config; // =========================================================== // Statistics @@ -100,7 +99,6 @@ public Sentence getSentence() { // private ManualConstraintsHandler manualConstraintsHandler; private StateConstraint stateConstraint; - private static final Logger logger = Logger.getLogger(Chart.class.getName()); // =============================================================== // Constructors @@ -169,7 +167,7 @@ public Chart(Sentence sentence, List featureFunctions, Grammar[ if (ff instanceof SourceDependentFF) ((SourceDependentFF) ff).setSource(sentence); - Decoder.LOG(2, "Finished seeding chart."); + LOG.debug("Finished seeding chart."); } /** @@ -434,8 +432,8 @@ public HyperGraph expandSansDotChart() { if (null == this.cells.get(0, sourceLength) || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), 
this.featureFunctions, this.sourceLength)) { - Decoder.LOG(1, String.format("Input %d: Parse failure (either no derivations exist or pruning is too aggressive", - sentence.id())); + LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive)", + sentence.id()); return null; } @@ -565,8 +563,7 @@ public HyperGraph expand() { for (int width = 1; width <= sourceLength; width++) { for (int i = 0; i <= sourceLength - width; i++) { int j = i + width; - if (logger.isLoggable(Level.FINEST)) - logger.finest(String.format("Processing span (%d, %d)", i, j)); + LOG.debug("Processing span ({}, {})", i, j); /* Skips spans for which no path exists (possible in lattices). */ if (inputLattice.distance(i, j) == Float.POSITIVE_INFINITY) { @@ -578,7 +575,7 @@ public HyperGraph expand() { * rules over (i,j-1) that need the terminal at (j-1,j) and looking at * all split points k to expand nonterminals. */ - logger.finest("Expanding cell"); + LOG.debug("Expanding cell"); for (int k = 0; k < this.grammars.length; k++) { /** * Each dotChart can act individually (without consulting other @@ -592,17 +589,17 @@ public HyperGraph expand() { * 2. The regular CKY part: add completed items onto the chart via cube * pruning. */ - logger.finest("Adding complete items into chart"); + LOG.debug("Adding complete items into chart"); completeSpan(i, j); /* 3. Process unary rules. */ - logger.finest("Adding unary items into chart"); + LOG.debug("Adding unary items into chart"); addUnaryNodes(this.grammars, i, j); // (4)=== in dot_cell(i,j), add dot-nodes that start from the /complete/ // superItems in // chart_cell(i,j) - logger.finest("Initializing new dot-items that start from complete items in this cell"); + LOG.debug("Initializing new dot-items that start from complete items in this cell"); for (int k = 0; k < this.grammars.length; k++) { if (this.grammars[k].hasRuleForSpan(i, j, inputLattice.distance(i, j))) { this.dotcharts[k].startDotItems(i, j); @@ -621,18 +618,18 @@ public HyperGraph expand() { } } - logStatistics(Level.INFO); + logStatistics(); // transition_final: setup a goal item, which may have many deductions if (null == this.cells.get(0, sourceLength) || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions, this.sourceLength)) { - Decoder.LOG(1, String.format("Input %d: Parse failure (either no derivations exist or pruning is too aggressive", - sentence.id())); + LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive)", + sentence.id()); return null; } - logger.fine("Finished expand"); + LOG.debug("Finished expand"); return new HyperGraph(this.goalBin.getSortedNodes().get(0), -1, -1, this.sentence); } @@ -657,9 +654,9 @@ public Cell getCell(int i, int j) { // Private methods // =============================================================== - private void logStatistics(Level level) { - Decoder.LOG(2, String.format("Input %d: Chart: added %d merged %d dot-items added: %d", - this.sentence.id(), this.nAdded, this.nMerged, this.nDotitemAdded)); + private void logStatistics() { + LOG.info("Input {}: Chart: added {} merged {} dot-items added: {}", + this.sentence.id(), this.nAdded, this.nMerged, this.nDotitemAdded); } /** @@ -683,8 +680,7 @@ private int addUnaryNodes(Grammar[] grammars, int i, int j) { ArrayList queue = new ArrayList(chartBin.getSortedNodes()); HashSet seen_lhs = new HashSet(); - if (logger.isLoggable(Level.FINEST)) - logger.finest("Adding unary to [" + i + ", " + j + "]"); + LOG.debug("Adding unary to 
[{}, {}]", i, j); while (queue.size() > 0) { HGNode node = queue.remove(0); @@ -713,9 +709,7 @@ private int addUnaryNodes(Grammar[] grammars, int i, int j) { HGNode resNode = chartBin.addHyperEdgeInCell(states, rule, i, j, antecedents, new SourcePath(), true); - if (logger.isLoggable(Level.FINEST)) - logger.finest(rule.toString()); - + LOG.debug("{}", rule); if (null != resNode && !seen_lhs.contains(resNode.lhs)) { queue.add(resNode); qtyAdditionsToQueue++; @@ -728,7 +722,7 @@ private int addUnaryNodes(Grammar[] grammars, int i, int j) { } /*** - * Add a terminal production (X -> english phrase) to the hypergraph. + * Add a terminal production (X -> english phrase) to the hypergraph. * * @param i the start index * @param j stop index diff --git a/src/joshua/decoder/chart_parser/ComputeNodeResult.java b/src/main/java/org/apache/joshua/decoder/chart_parser/ComputeNodeResult.java similarity index 73% rename from src/joshua/decoder/chart_parser/ComputeNodeResult.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/ComputeNodeResult.java index 373ed402..e691da6f 100644 --- a/src/joshua/decoder/chart_parser/ComputeNodeResult.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/ComputeNodeResult.java @@ -16,31 +16,35 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.List; -import joshua.decoder.Decoder; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class computes the cost of applying a rule. * - * @author Matt Post - * @author Zhifei Li, + * @author Matt Post post@cs.jhu.edu + * @author Zhifei Li, zhifei.work@gmail.com */ public class ComputeNodeResult { + private static final Logger LOG = LoggerFactory.getLogger(ComputeNodeResult.class); + // The cost incurred by the rule itself (and all associated feature functions) private float transitionCost; @@ -52,13 +56,20 @@ public class ComputeNodeResult { // The StateComputer objects themselves serve as keys. private List dpStates; - + /** * Computes the new state(s) that are produced when applying the given rule to the list of tail * nodes. Also computes a range of costs of doing so (the transition cost, the total (Viterbi) * cost, and a score that includes a future cost estimate). * * Old version that doesn't use the derivation state. 
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to use when computing the node result + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode}'s + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source lattice + * @param sentence the lattice input */ public ComputeNodeResult(List featureFunctions, Rule rule, List tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence) { @@ -66,12 +77,9 @@ public ComputeNodeResult(List featureFunctions, Rule rule, List // The total Viterbi cost of this edge. This is the Viterbi cost of the tail nodes, plus // whatever costs we incur applying this rule to create a new hyperedge. float viterbiCost = 0.0f; - - if (Decoder.VERBOSE >= 4) { - System.err.println("ComputeNodeResult():"); - System.err.println("-> RULE " + rule); - } - + LOG.debug("ComputeNodeResult():"); + LOG.debug("-> RULE {}", rule); + /* * Here we sum the accumulated cost of each of the tail nodes. The total cost of the new * hyperedge (the inside or Viterbi cost) is the sum of these nodes plus the cost of the @@ -80,10 +88,8 @@ public ComputeNodeResult(List featureFunctions, Rule rule, List */ if (null != tailNodes) { for (HGNode item : tailNodes) { - if (Decoder.VERBOSE >= 4) { - System.err.println(" -> item.bestedge: " + item); - System.err.println("-> TAIL NODE " + item); - } + LOG.debug("-> item.bestedge: {}", item); + LOG.debug("-> TAIL NODE {}", item); viterbiCost += item.bestHyperedge.getBestDerivationScore(); } } @@ -95,7 +101,7 @@ public ComputeNodeResult(List featureFunctions, Rule rule, List // The future cost estimate is a heuristic estimate of the outside cost of this edge. float futureCostEstimate = 0.0f; - + /* * We now iterate over all the feature functions, computing their cost and their expected future * cost. @@ -105,32 +111,29 @@ public ComputeNodeResult(List featureFunctions, Rule rule, List DPState newState = feature.compute(rule, tailNodes, i, j, sourcePath, sentence, acc); transitionCost += acc.getScore(); - - if (Decoder.VERBOSE >= 4) - System.err.println(String.format("-> FEATURE %s = %.3f * %.3f = %.3f", - feature.getName(), acc.getScore() / Decoder.weights.getSparse(feature.getName()), - Decoder.weights.getSparse(feature.getName()), acc.getScore())); + + + LOG.debug("FEATURE {} = {} * {} = {}", feature.getName(), + acc.getScore() / Decoder.weights.getSparse(feature.getName()), + Decoder.weights.getSparse(feature.getName()), acc.getScore()); if (feature.isStateful()) { futureCostEstimate += feature.estimateFutureCost(rule, newState, sentence); allDPStates.add(((StatefulFF)feature).getStateIndex(), newState); } } - viterbiCost += transitionCost; - - if (Decoder.VERBOSE >= 4) - System.err.println(String.format("-> COST = %.3f", transitionCost)); - + LOG.debug("-> COST = {}", transitionCost); // Set the final results. this.pruningCostEstimate = viterbiCost + futureCostEstimate; this.viterbiCost = viterbiCost; this.transitionCost = transitionCost; this.dpStates = allDPStates; } - + /** - * This is called from Cell.java when making the final transition to the goal state. + * This is called from {@link org.apache.joshua.decoder.chart_parser.Cell} + * when making the final transition to the goal state. * This is done to allow feature functions to correct for partial estimates, since * they now have the knowledge that the whole sentence is complete. 
Basically, this * is only used by LanguageModelFF, which does not score partial n-grams, and therefore @@ -140,6 +143,14 @@ public ComputeNodeResult(List featureFunctions, Rule rule, List * too: it makes search better (more accurate at the beginning, for example), and would * also do away with the need for the computeFinal* class of functions (and hooks in * the feature function interface). + * + * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode}'s + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source lattice + * @param sentence the lattice input + * @return the final cost for the Node */ public static float computeFinalCost(List featureFunctions, List tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence) { @@ -150,13 +161,13 @@ public static float computeFinalCost(List featureFunctions, } return cost; } - + public static FeatureVector computeTransitionFeatures(List featureFunctions, HyperEdge edge, int i, int j, Sentence sentence) { // Initialize the set of features with those that were present with the rule in the grammar. FeatureVector featureDelta = new FeatureVector(); - + // === compute feature logPs for (FeatureFunction ff : featureFunctions) { // A null rule signifies the final transition. @@ -166,7 +177,7 @@ public static FeatureVector computeTransitionFeatures(List feat featureDelta.add(ff.computeFeatures(edge.getRule(), edge.getTailNodes(), i, j, edge.getSourcePath(), sentence)); } } - + return featureDelta; } @@ -176,11 +187,12 @@ public float getPruningEstimate() { /** * The complete cost of the Viterbi derivation at this point + * @return float representing cost */ public float getViterbiCost() { return this.viterbiCost; } - + public float getBaseCost() { return getViterbiCost() - getTransitionCost(); } @@ -188,7 +200,7 @@ public float getBaseCost() { /** * The cost incurred by this edge alone * - * @return + * @return float representing cost */ public float getTransitionCost() { return this.transitionCost; diff --git a/src/joshua/decoder/chart_parser/CubePruneState.java b/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java similarity index 92% rename from src/joshua/decoder/chart_parser/CubePruneState.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java index c9ee8e6c..7c2fe5c4 100644 --- a/src/joshua/decoder/chart_parser/CubePruneState.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.chart_parser.DotChart.DotNode; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.chart_parser.DotChart.DotNode; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; // =============================================================== // CubePruneState class diff --git a/src/joshua/decoder/chart_parser/DotChart.java b/src/main/java/org/apache/joshua/decoder/chart_parser/DotChart.java similarity index 95% rename from src/joshua/decoder/chart_parser/DotChart.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/DotChart.java index b82b68c8..7d1479e0 100644 --- a/src/joshua/decoder/chart_parser/DotChart.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/DotChart.java @@ -16,26 +16,26 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.logging.Level; -import java.util.logging.Logger; - -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.segment_file.Token; -import joshua.lattice.Arc; -import joshua.lattice.Lattice; -import joshua.lattice.Node; -import joshua.util.ChartSpan; + +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.lattice.Arc; +import org.apache.joshua.lattice.Lattice; +import org.apache.joshua.lattice.Node; +import org.apache.joshua.util.ChartSpan; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * The DotChart handles Earley-style implicit binarization of translation rules. @@ -58,6 +58,13 @@ */ class DotChart { + // =============================================================== + // Static fields + // =============================================================== + + private static final Logger LOG = LoggerFactory.getLogger(DotChart.class); + + // =============================================================== // Package-protected instance fields // =============================================================== @@ -94,11 +101,6 @@ public DotCell getDotCell(int i, int j) { /* If enabled, rule terminals are treated as regular expressions. 
*/ private final boolean regexpMatching; - // =============================================================== - // Static fields - // =============================================================== - - private static final Logger logger = Logger.getLogger(DotChart.class.getName()); // =============================================================== // Constructors @@ -169,8 +171,7 @@ void seed() { * */ void expandDotCell(int i, int j) { - if (logger.isLoggable(Level.FINEST)) - logger.finest("Expanding dot cell (" + i + "," + j + ")"); + LOG.debug("Expanding dot cell ({}, {})", i, j); /* * (1) If the dot is just to the left of a non-terminal variable, we look for theorems or axioms @@ -352,15 +353,15 @@ private void addDotItem(Trie tnode, int i, int j, ArrayList antSuperN dotcells.get(i, j).addDotNode(item); dotChart.nDotitemAdded++; - if (logger.isLoggable(Level.FINEST)) { - logger.finest(String.format("Add a dotitem in cell (%d, %d), n_dotitem=%d, %s", i, j, - dotChart.nDotitemAdded, srcPath)); + if (LOG.isDebugEnabled()) { + LOG.debug("Add a dotitem in cell ({}, {}), n_dotitem={}, {}", i, j, + dotChart.nDotitemAdded, srcPath); RuleCollection rules = tnode.getRuleCollection(); if (rules != null) { for (Rule r : rules.getRules()) { // System.out.println("rule: "+r.toString()); - logger.finest(r.toString()); + LOG.debug("{}", r); } } } diff --git a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java b/src/main/java/org/apache/joshua/decoder/chart_parser/ManualConstraintsHandler.java similarity index 83% rename from src/joshua/decoder/chart_parser/ManualConstraintsHandler.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/ManualConstraintsHandler.java index baed9849..3b7c6446 100644 --- a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/ManualConstraintsHandler.java @@ -16,26 +16,28 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.HashMap; import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.segment_file.ConstraintRule; -import joshua.decoder.segment_file.ConstraintSpan; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.segment_file.ConstraintRule; +import org.apache.joshua.decoder.segment_file.ConstraintSpan; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class ManualConstraintsHandler { + private static final Logger LOG = LoggerFactory.getLogger(ManualConstraintsHandler.class); + // TODO: each span only has one ConstraintSpan // contain spans that have LHS or RHS constraints (they are always hard) private HashMap constraintSpansForFiltering; @@ -43,11 +45,9 @@ public class ManualConstraintsHandler { // contain spans that have hard "rule" constraint; key: start_span; value: // end_span private ArrayList spansWithHardRuleConstraint; - private Chart chart; - private Grammar grammarForConstructManualRule; - private static final Logger logger = Logger.getLogger(ManualConstraintsHandler.class.getName()); + private Grammar grammarForConstructManualRule; public ManualConstraintsHandler(Chart chart, Grammar grammarForConstructManualRule, List constraintSpans) { @@ -109,19 +109,16 @@ private void initialize(List constraintSpans) { // add to the chart chart.addAxiom(cSpan.start(), cSpan.end(), rule, new SourcePath()); - if (logger.isLoggable(Level.INFO)) - logger.info("Adding RULE constraint for span " + cSpan.start() + ", " - + cSpan.end() + "; isHard=" + cSpan.isHard() + rule.getLHS()); + LOG.info("Adding RULE constraint for span {}, {}; isHard={}", + cSpan.start(), cSpan.end(), cSpan.isHard() + "" + rule.getLHS()); break; - default: shouldAdd = true; } } if (shouldAdd) { - if (logger.isLoggable(Level.INFO)) - logger.info("Adding LHS or RHS constraint for span " + cSpan.start() + ", " - + cSpan.end()); + LOG.info("Adding LHS or RHS constraint for span {}, {}", + cSpan.start(), cSpan.end()); if (null == this.constraintSpansForFiltering) { this.constraintSpansForFiltering = new HashMap(); } @@ -141,6 +138,11 @@ private void initialize(List constraintSpans) { /** * if there are any LHS or RHS constraints for a span, then all the applicable grammar rules in * that span will have to pass the filter. 
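Reviewer note: to make the newly documented filterRules/shouldSurvive contract concrete, here is a hedged standalone sketch with simplified stand-ins for ConstraintRule and Rule. It matches on the left-hand side only; the real shouldSurvive also considers the rule's source side, so treat this purely as an illustration.

import java.util.ArrayList;
import java.util.List;

public class ConstraintFilterSketch {
  // Simplified stand-ins for Joshua's ConstraintRule and Rule types.
  static class Constraint { final int lhs; Constraint(int lhs) { this.lhs = lhs; } }
  static class Rule { final int lhs; final String src; Rule(int lhs, String src) { this.lhs = lhs; this.src = src; } }

  // Keeps only the rules that survive every constraint registered for the span.
  static List<Rule> filterRules(List<Constraint> constraintsForSpan, List<Rule> rulesIn) {
    List<Rule> out = new ArrayList<>();
    for (Rule r : rulesIn) {
      boolean survives = true;
      for (Constraint c : constraintsForSpan)
        survives &= (c.lhs == r.lhs); // LHS constraint: the nonterminal must match
      if (survives)
        out.add(r);
    }
    return out;
  }

  public static void main(String[] args) {
    List<Constraint> cs = List.of(new Constraint(1));
    List<Rule> rules = List.of(new Rule(1, "la maison"), new Rule(2, "maison"));
    System.out.println(filterRules(cs, rules).get(0).src); // la maison
  }
}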
+ * + * @param i LHS of span, used for generating the span signature + * @param j RHS of span, used for generating the span signature + * @param rulesIn {@link java.util.List} of {@link org.apache.joshua.decoder.ff.tm.Rule}'s + * @return filtered {@link java.util.List} of {@link org.apache.joshua.decoder.ff.tm.Rule}'s */ public List filterRules(int i, int j, List rulesIn) { if (null == this.constraintSpansForFiltering) return rulesIn; @@ -165,6 +167,9 @@ public List filterRules(int i, int j, List rulesIn) { /** * should we filter out the gRule based on the manually provided constraint cRule + * @param cRule constraint rule + * @param gRule rule which may be filtered + * @return true if this gRule should survive */ public boolean shouldSurvive(ConstraintRule cRule, Rule gRule) { @@ -189,6 +194,9 @@ public boolean shouldSurvive(ConstraintRule cRule, Rule gRule) { /** * if a span is *within* the coverage of a *hard* rule constraint, then this span will be only * allowed to use the manual rules + * @param startSpan beginning node (int) for span + * @param endSpan end node (int) for span + * @return true if this span contains a rule constraint */ public boolean containHardRuleConstraint(int startSpan, int endSpan) { if (null != this.spansWithHardRuleConstraint) { diff --git a/src/joshua/decoder/chart_parser/SourcePath.java b/src/main/java/org/apache/joshua/decoder/chart_parser/SourcePath.java similarity index 87% rename from src/joshua/decoder/chart_parser/SourcePath.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/SourcePath.java index b1fbe096..1d961490 100644 --- a/src/joshua/decoder/chart_parser/SourcePath.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/SourcePath.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; -import joshua.decoder.segment_file.Token; -import joshua.lattice.Arc; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.lattice.Arc; /** * This class represents information about a path taken through the source lattice. * - * @note This implementation only tracks the source path cost which is assumed to be a scalar value. + *
This implementation only tracks the source path cost which is assumed to be a scalar value. * If you need multiple values, or want to recover more detailed path statistics, you'll need * to update this code. */ diff --git a/src/joshua/decoder/chart_parser/StateConstraint.java b/src/main/java/org/apache/joshua/decoder/chart_parser/StateConstraint.java similarity index 87% rename from src/joshua/decoder/chart_parser/StateConstraint.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/StateConstraint.java index e17cee04..d21ceca4 100644 --- a/src/joshua/decoder/chart_parser/StateConstraint.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/StateConstraint.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.Collection; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; /** * This class provides constraints on the sorts of states that are permitted in the chart. Its * original motivation was to be used as a means of doing forced decoding, which is accomplished by * forcing all n-gram states that are created to match the target string. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * */ public class StateConstraint { @@ -43,7 +43,7 @@ public StateConstraint(String target) { * Determines if all of the states passed in are legal in light of the input that was passed * earlier. Currently only defined for n-gram states. * - * @param dpStates + * @param dpStates {@link java.util.Collection} of {@link org.apache.joshua.decoder.ff.state_maintenance.DPState}'s * @return whether the states are legal in light of the target side sentence */ public boolean isLegal(Collection dpStates) { diff --git a/src/joshua/decoder/chart_parser/SuperNode.java b/src/main/java/org/apache/joshua/decoder/chart_parser/SuperNode.java similarity index 94% rename from src/joshua/decoder/chart_parser/SuperNode.java rename to src/main/java/org/apache/joshua/decoder/chart_parser/SuperNode.java index 6ed4bcd7..a7c6e346 100644 --- a/src/joshua/decoder/chart_parser/SuperNode.java +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/SuperNode.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.chart_parser; +package org.apache.joshua.decoder.chart_parser; import java.util.ArrayList; import java.util.List; -import joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HGNode; /** * Represents a list of items in the hypergraph that have the same left-hand side but may have diff --git a/src/main/java/org/apache/joshua/decoder/chart_parser/package-info.java b/src/main/java/org/apache/joshua/decoder/chart_parser/package-info.java new file mode 100644 index 00000000..8bf73baa --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/chart_parser/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides an implementation of a hierarchical phrase-based + * decoder for statistical machine translation. The code in + * this package is based largely on algorithms from Chiang (2007). + */ +package org.apache.joshua.decoder.chart_parser; diff --git a/src/joshua/decoder/ff/ArityPhrasePenalty.java b/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java similarity index 75% rename from src/joshua/decoder/ff/ArityPhrasePenalty.java rename to src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java index 82238994..d4f9534a 100644 --- a/src/joshua/decoder/ff/ArityPhrasePenalty.java +++ b/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java @@ -16,25 +16,25 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; -import joshua.decoder.chart_parser.SourcePath; -import joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.corpus.Vocabulary; /** * This feature function counts rules from a particular grammar (identified by the owner) having an * arity within a specific range. It expects three parameters upon initialization: the owner, the * minimum arity, and the maximum arity. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu + * @author Zhifei Li zhifei.work@gmail.com */ public class ArityPhrasePenalty extends StatelessFF { @@ -69,4 +69,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa return null; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/FeatureFunction.java b/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java similarity index 72% rename from src/joshua/decoder/ff/FeatureFunction.java rename to src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java index 40b92b3a..e5f0baaa 100644 --- a/src/joshua/decoder/ff/FeatureFunction.java +++ b/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.HashMap; @@ -24,34 +24,35 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** - * This class defines Joshua's feature function interface, for both sparse and + *
<p>This class defines Joshua's feature function interface, for both sparse and * dense features. It is immediately inherited by StatelessFF and StatefulFF, * which provide functionality common to stateless and stateful features, * respectively. Any feature implementation should extend those classes, and not * this one. The distinction between stateless and stateful features is somewhat * narrow: all features have the opportunity to return an instance of a - * {@link DPState} object, and stateless ones just return null. + * {@link DPState} object, and stateless ones just return null.</p> * - * Features in Joshua work like templates. Each feature function defines any + * <p>Features in Joshua work like templates. Each feature function defines any * number of actual features, which are associated with weights. The task of the * feature function is to compute the features that are fired in different * circumstances and then return the inner product of those features with the * weight vector. Feature functions can also produce estimates of their future - * cost (via {@link estimateCost()}); these values are not used in computing the + * cost (via {@link org.apache.joshua.decoder.ff.FeatureFunction#estimateCost(Rule, Sentence)}); + * these values are not used in computing the * score, but are only used for sorting rules during cube pruning. The * individual features produced by each template should have globally unique * names; a good convention is to prefix each feature with the name of the - * template that produced it. + * template that produced it.</p> * - * Joshua does not retain individual feature values while decoding, since this + * <p>Joshua does not retain individual feature values while decoding, since this * requires keeping a sparse feature vector along every hyperedge, which can be * expensive. Instead, it computes only the weighted cost of each edge. If the * individual feature values are requested, the feature functions are replayed @@ -59,10 +60,10 @@ * a generic way by passing an {@link Accumulator} object to the compute() * function. During decoding, the accumulator simply sums weighted features in a * scalar. During k-best extraction, when individual feature values are needed, - * a {@link FeatureAccumulator} is used to retain the individual values. + * a {@link FeatureAccumulator} is used to retain the individual values.</p>
* - * @author Matt Post - * @author Juri Ganitkevich + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevich juri@cs.jhu.edu */ public abstract class FeatureFunction { @@ -72,7 +73,7 @@ public abstract class FeatureFunction { * names, for templates that define multiple features. */ protected String name = null; - + /* * The list of features each function can contribute, along with the dense feature IDs. */ @@ -93,14 +94,14 @@ public abstract class FeatureFunction { * instantiated */ protected FeatureVector weights; - + /* The config */ protected JoshuaConfiguration config; public String getName() { return name; } - + // Whether the feature has state. public abstract boolean isStateful(); @@ -112,7 +113,7 @@ public FeatureFunction(FeatureVector weights, String name, String[] args, Joshua this.parsedArgs = FeatureFunction.parseArgs(args); } - + /** * Any feature function can use this to report dense features names to the master code. The * parameter tells the feature function the index of the first available dense feature ID; the feature @@ -135,22 +136,23 @@ public String logString() { /** * This is the main function for defining feature values. The implementor - * should compute all the features along the hyperedge, calling acc.put(name, - * value) for each feature. It then returns the newly-computed dynamic + * should compute all the features along the hyperedge, calling + * {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator#add(String, float)} + * for each feature. It then returns the newly-computed dynamic * programming state for this feature (for example, for the - * {@link LanguageModelFF} feature, this returns the new language model + * {@link org.apache.joshua.decoder.ff.lm.LanguageModelFF} feature, this returns the new language model * context). For stateless features, this value is null. * * Note that the accumulator accumulates *unweighted* feature values. The * feature vector is multiplied times the weight vector later on. * - * @param rule - * @param tailNodes - * @param i - * @param j - * @param sourcePath - * @param sentID - * @param acc + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation * @return the new dynamic programming state (null for stateless features) */ public abstract DPState compute(Rule rule, List tailNodes, int i, int j, @@ -160,12 +162,12 @@ public abstract DPState compute(Rule rule, List tailNodes, int i, int j, * Feature functions must overrided this. StatefulFF and StatelessFF provide * reasonable defaults since most features do not fire on the goal node. 
* - * @param tailNode - * @param i - * @param j - * @param sourcePath - * @param sentID - * @param acc + * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation * @return the DPState (null if none) */ public abstract DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, @@ -181,12 +183,12 @@ public abstract DPState computeFinal(HGNode tailNode, int i, int j, SourcePath s * incorporate the feature weights. This function is used in the kbest * extraction code but could also be used in computing the cost. * - * @param rule - * @param tailNodes - * @param i - * @param j - * @param sourcePath - * @param sentID + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input * @return an *unweighted* feature delta */ public final FeatureVector computeFeatures(Rule rule, List tailNodes, int i, int j, @@ -203,11 +205,11 @@ public final FeatureVector computeFeatures(Rule rule, List tailNodes, in * return the *weighted* cost of applying the feature. Provided for backward * compatibility. * - * @param tailNode - * @param i - * @param j - * @param sourcePath - * @param sentID + * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input * @return a *weighted* feature cost */ public final float computeFinalCost(HGNode tailNode, int i, int j, SourcePath sourcePath, @@ -222,12 +224,12 @@ public final float computeFinalCost(HGNode tailNode, int i, int j, SourcePath so * Returns the *unweighted* feature delta for the final transition (e.g., for * the language model feature function). Provided for backward compatibility. * - * @param tailNode - * @param i - * @param j - * @param sourcePath - * @param sentID - * @return + * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @return an *weighted* feature vector */ public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence) { @@ -247,6 +249,8 @@ public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j, * sorting. Later, the real cost of this feature function is called via * compute(); * + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param sentence {@link org.apache.joshua.lattice.Lattice} input * @return the *weighted* cost of applying the feature. 
*/ public abstract float estimateCost(Rule rule, Sentence sentence); @@ -257,9 +261,9 @@ public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j, * score but is used in pruning decisions. Stateless features return 0.0f by * default, but Stateful features might want to override this. * - * @param rule - * @param state - * @param sentence + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param state todo + * @param sentence {@link org.apache.joshua.lattice.Lattice} input * @return the *weighted* future cost estimate of applying this rule in * context. */ @@ -271,7 +275,7 @@ public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j, * Any key without a value is added with an empty string as value Multiple values for the same key * are not parsed. The first one is used. * - * @param rawArgs A string with the raw arguments and their names + * @param args A string with the raw arguments and their names * @return A hash with the keys and the values of the string */ public static HashMap parseArgs(String[] args) { @@ -303,13 +307,25 @@ public static HashMap parseArgs(String[] args) { return parsedArgs; } + /** + * It is used when initializing translation grammars (for + * pruning purpose, and to get stateless logP for each rule). + * This is also required to sort the rules (required by Cube-pruning). + * + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param sentID associated ID + * @return double value representing LogP + */ + public abstract double estimateLogP(Rule rule, int sentID); + + public abstract double getWeight(); + /** * Accumulator objects allow us to generalize feature computation. * ScoreAccumulator takes (feature,value) pairs and simple stores the weighted * sum (for decoding). FeatureAccumulator records the named feature values * (for k-best extraction). */ - public interface Accumulator { public void add(String name, float value); public void add(int id, float value); @@ -326,7 +342,7 @@ public ScoreAccumulator() { public void add(String name, float value) { score += value * weights.getSparse(name); } - + @Override public void add(int id, float value) { score += value * weights.getDense(id); @@ -348,7 +364,7 @@ public FeatureAccumulator() { public void add(String name, float value) { features.increment(name, value); } - + @Override public void add(int id, float value) { features.increment(id, value); diff --git a/src/joshua/decoder/ff/FeatureVector.java b/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java similarity index 90% rename from src/joshua/decoder/ff/FeatureVector.java rename to src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java index dcbcda2e..778997ea 100644 --- a/src/joshua/decoder/ff/FeatureVector.java +++ b/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.Collections; @@ -35,9 +35,11 @@ * queries each of them for their sparse features via {@link registerDenseFeatures}. Those features * returned by each decoder are then *removed* from the sparse feature hash and placed in the dense * feature array. Therefore, when a feature registers a dense feature, it should take care to - * query either {@link getDense()} or {@link getSparse} when asking for the feature values later on. 
+ * query either {@link org.apache.joshua.decoder.ff.FeatureVector#getDense(int)} or + * {@link org.apache.joshua.decoder.ff.FeatureVector#getSparse(String)} when asking for the feature + * values later on. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class FeatureVector { @@ -75,8 +77,8 @@ public FeatureVector() { * **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot * of confusion. They have to be inverted here and when the score is actually computed. They * are inverted here (which is used to build the feature vector representation of a rule's dense - * features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable - * (weighted) score is cached. + * features) and in {@link org.apache.joshua.decoder.ff.tm.BilingualRule#estimateRuleCost(java.util.List)} + * , where the rule's precomputable (weighted) score is cached. * * @param featureString, the string of labeled and unlabeled features (probably straight from the * grammar text file) @@ -138,8 +140,7 @@ public FeatureVector(String featureString, String prefix) { * can infer them all). This *must* be called by every feature function wishing to register * dense features! * - * @param names - * @return + * @param featureFunctions {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s */ public void registerDenseFeatures(ArrayList featureFunctions) { for (FeatureFunction feature: featureFunctions) { @@ -181,6 +182,8 @@ public FeatureVector clone() { * Subtracts the weights in the other feature vector from this one. Note that this is not set * subtraction; keys found in the other FeatureVector but not in this one will be initialized with * a value of 0.0f before subtraction. + * + * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to subtract its score */ public void subtract(FeatureVector other) { for (int i = 0; i < denseFeatures.size(); i++) @@ -195,6 +198,8 @@ public void subtract(FeatureVector other) { /** * Adds the weights in the other feature vector to this one. This is set union, with values shared * between the two being summed. + * + * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to add its score */ public void add(FeatureVector other) { while (denseFeatures.size() < other.denseFeatures.size()) @@ -214,6 +219,8 @@ public void add(FeatureVector other) { /** * Return the weight of a feature by name, after checking to determine if it is sparse or dense. * + * @param feature String name of some feature + * @return the feature's weight */ public float getWeight(String feature) { for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { @@ -227,7 +234,7 @@ public float getWeight(String feature) { /** * Return the weight of a sparse feature, indexed by its name. * - * @param feature + * @param feature String name of some feature * @return the sparse feature's weight, or 0 if not found. */ public float getSparse(String feature) { @@ -244,7 +251,7 @@ public boolean hasValue(String name) { * Return the weight of a dense feature, indexed by its feature index, or 0.0f, if the feature * is not found. In other words, this is a safe way to query the dense feature vector. * - * @param id + * @param id int representing of some dense feature * @return the dense feature's value, or 0 if not found. */ public float getDense(int id) { @@ -267,8 +274,8 @@ public void increment(int id, float value) { * Set the value of a feature. 
We need to first determine whether the feature is a dense or * sparse one, then set accordingly. * - * @param feature - * @param value + * @param feature String name of some feature + * @param value float value to set to the featue with the associated name */ public void set(String feature, float value) { for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { @@ -293,6 +300,9 @@ public Map getMap() { /** * Computes the inner product between this feature vector and another one. + * + * @param other a {@link org.apache.joshua.decoder.ff.FeatureVector} with which to compute the inner product + * @return float value representing the computation */ public float innerProduct(FeatureVector other) { float cost = 0.0f; @@ -313,6 +323,8 @@ public void times(float value) { /*** * Moses distinguishes sparse features as those containing an underscore, so we have to fake it * to be compatible with their tuners. + * + * @return trimmed Moses output string */ public String mosesString() { StringBuilder outputString = new StringBuilder(); diff --git a/src/joshua/decoder/ff/LabelCombinationFF.java b/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java similarity index 76% rename from src/joshua/decoder/ff/LabelCombinationFF.java rename to src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java index 38a85db7..f80e0b7f 100644 --- a/src/joshua/decoder/ff/LabelCombinationFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; /*** * @author Gideon Wenniger @@ -24,12 +24,12 @@ import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; public class LabelCombinationFF extends StatelessFF { @@ -60,4 +60,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa return null; } + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } + } diff --git a/src/joshua/decoder/ff/LabelSubstitutionFF.java b/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java similarity index 88% rename from src/joshua/decoder/ff/LabelSubstitutionFF.java rename to src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java index 0f703724..2c247fe2 100644 --- a/src/joshua/decoder/ff/LabelSubstitutionFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; /*** * @author Gideon Wenniger @@ -24,13 +24,13 @@ import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; -import joshua.util.ListUtil; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.util.ListUtil; public class LabelSubstitutionFF extends StatelessFF { private static final String MATCH_SUFFIX = "MATCH"; @@ -129,4 +129,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa return null; } + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } + } diff --git a/src/joshua/decoder/ff/OOVPenalty.java b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java similarity index 83% rename from src/joshua/decoder/ff/OOVPenalty.java rename to src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java index 6a065487..69584ddc 100644 --- a/src/joshua/decoder/ff/OOVPenalty.java +++ b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.HashMap; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.JoshuaConfiguration.OOVItem; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; -import joshua.corpus.Vocabulary; -import joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.chart_parser.SourcePath; /** * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is @@ -39,7 +39,7 @@ * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class OOVPenalty extends StatelessFF { private int ownerID = -1; @@ -102,4 +102,16 @@ public float estimateCost(Rule rule, Sentence sentence) { private float getValue(int lhs) { return oovWeights.containsKey(lhs) ? 
oovWeights.get(lhs) : defaultValue; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/PhraseModel.java b/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java similarity index 85% rename from src/joshua/decoder/ff/PhraseModel.java rename to src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java index 9882bc1d..3eb0c2e8 100644 --- a/src/joshua/decoder/ff/PhraseModel.java +++ b/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java @@ -16,19 +16,19 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** * This feature handles the list of features that are found with grammar rules in the grammar file. @@ -37,8 +37,8 @@ * queries the weights for the set of features that are active for this grammar, storing them in an * array. * - * @author Matt Post - * @author Zhifei Li + * @author Matt Post post@cs.jhu.edu + * @author Zhifei Li zhifei.work@gmail.com */ public class PhraseModel extends StatelessFF { @@ -132,4 +132,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa public String toString() { return name + " " + Vocabulary.word(ownerID); } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/PhrasePenalty.java b/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java similarity index 78% rename from src/joshua/decoder/ff/PhrasePenalty.java rename to src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java index fa6a3d1a..a185286e 100644 --- a/src/joshua/decoder/ff/PhrasePenalty.java +++ b/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java @@ -16,19 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.phrase.Hypothesis; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.phrase.Hypothesis; +import org.apache.joshua.decoder.segment_file.Sentence; /** * This feature just counts rules that are used. You can restrict it with a number of flags: @@ -83,4 +83,16 @@ public float estimateCost(Rule rule, Sentence sentence) { return weights.getDense(denseFeatureIndex) * value; return 0.0f; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/RuleCountBin.java b/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java similarity index 69% rename from src/joshua/decoder/ff/RuleCountBin.java rename to src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java index cd7d9e72..55abd510 100644 --- a/src/joshua/decoder/ff/RuleCountBin.java +++ b/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java @@ -16,23 +16,27 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /* * This feature computes a bin for the rule and activates a feature for it. It requires access to * the index of the RarityPenalty field, from which the rule count can be computed. 
*/ public class RuleCountBin extends StatelessFF { + + private static final Logger LOG = LoggerFactory.getLogger(RuleCountBin.class); private int field = -1; public RuleCountBin(FeatureVector weights, String[] args, JoshuaConfiguration config) { @@ -61,10 +65,22 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa } } - System.err.println(String.format("RuleCountBin(%f) = %d ==> %s", rarityPenalty, count, feature)); + LOG.debug("RuleCountBin({}) = {} ==> {}", rarityPenalty, count, feature); acc.add(feature, 1.0f); return null; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/RuleFF.java b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java similarity index 80% rename from src/joshua/decoder/ff/RuleFF.java rename to src/main/java/org/apache/joshua/decoder/ff/RuleFF.java index 9fb7d3e5..bc6d67b6 100644 --- a/src/joshua/decoder/ff/RuleFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** * This feature just counts rules that are used. You can restrict it with a number of flags: @@ -85,4 +85,16 @@ private String getRuleString(Rule rule) { } return ruleString.replaceAll("[ =]", "~"); } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/RuleLength.java b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java similarity index 79% rename from src/joshua/decoder/ff/RuleLength.java rename to src/main/java/org/apache/joshua/decoder/ff/RuleLength.java index 645905a5..59b1c207 100644 --- a/src/joshua/decoder/ff/RuleLength.java +++ b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java @@ -16,22 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /* * This feature computes three feature templates: a feature indicating the length of the rule's * source side, its target side, and a feature that pairs them. */ -public class RuleLength extends StatelessFF { +public abstract class RuleLength extends StatelessFF { public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) { super(weights, "RuleLength", args, config); diff --git a/src/joshua/decoder/ff/RulePropertiesQuerying.java b/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java similarity index 89% rename from src/joshua/decoder/ff/RulePropertiesQuerying.java rename to src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java index 777c7902..a1867a35 100644 --- a/src/joshua/decoder/ff/RulePropertiesQuerying.java +++ b/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; public class RulePropertiesQuerying { diff --git a/src/joshua/decoder/ff/RuleShape.java b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java similarity index 79% rename from src/joshua/decoder/ff/RuleShape.java rename to src/main/java/org/apache/joshua/decoder/ff/RuleShape.java index e243528f..a5140216 100644 --- a/src/joshua/decoder/ff/RuleShape.java +++ b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /* * Implements the RuleShape feature for source, target, and paired source+target sides. 
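As an illustration of what a rule-shape signature can look like, here is a minimal, self-contained sketch; the class and helper names are invented for this example and are not taken from the patch. It collapses a rule's source side into a terminal/nonterminal pattern, so that rules such as "the [X,1] house" and "a [X,1] car" can share a single shape feature.

    // Hypothetical sketch (not Joshua's actual RuleShape code): map each
    // source token to 'x' for a terminal run and 'N' for a nonterminal gap.
    import java.util.List;

    public class RuleShapeSketch {

      /** Collapses a token sequence into a terminal/nonterminal pattern. */
      static String shapeOf(List<String> sourceTokens) {
        StringBuilder shape = new StringBuilder();
        boolean lastWasTerminal = false;
        for (String token : sourceTokens) {
          boolean nonterminal = token.startsWith("[") && token.endsWith("]");
          if (nonterminal) {
            shape.append(shape.length() > 0 ? " N" : "N");
            lastWasTerminal = false;
          } else if (!lastWasTerminal) {
            // Runs of adjacent terminals collapse to a single 'x'.
            shape.append(shape.length() > 0 ? " x" : "x");
            lastWasTerminal = true;
          }
        }
        return shape.toString();
      }

      public static void main(String[] args) {
        System.out.println(shapeOf(List.of("the", "[X,1]", "house"))); // x N x
      }
    }
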
@@ -70,4 +70,16 @@ public DPState compute(Rule rule, List tailNodes, int i_, int j, SourceP return null; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/SourceDependentFF.java b/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java similarity index 90% rename from src/joshua/decoder/ff/SourceDependentFF.java rename to src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java index 2f490fa9..841402ab 100644 --- a/src/joshua/decoder/ff/SourceDependentFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.segment_file.Sentence; public interface SourceDependentFF extends Cloneable { diff --git a/src/joshua/decoder/ff/SourcePathFF.java b/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java similarity index 72% rename from src/joshua/decoder/ff/SourcePathFF.java rename to src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java index 68dc595d..d5295599 100644 --- a/src/joshua/decoder/ff/SourcePathFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java @@ -16,24 +16,24 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** * This feature returns the scored path through the source lattice, which is recorded in a * SourcePath object. * - * @author Chris Dyer - * @author Matt Post + * @author Chris Dyer redpony@umd.edu + * @author Matt Post post@cs.jhu.edu */ public final class SourcePathFF extends StatelessFF { @@ -60,4 +60,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa acc.add(denseFeatureIndex, sourcePath.getPathCost()); return null; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/StatefulFF.java b/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java similarity index 79% rename from src/joshua/decoder/ff/StatefulFF.java rename to src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java index 4ec2e57d..e55e6a78 100644 --- a/src/joshua/decoder/ff/StatefulFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Stateful features contribute dynamic programming state. Unlike earlier versions of Joshua, the @@ -35,11 +37,12 @@ * state-contributing objects in each HGNode. State can no longer be shared among different feature * functions. * - * @author Matt Post - * @author Juri Ganitkevich + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevich juri@cs.jhu.edu */ public abstract class StatefulFF extends FeatureFunction { + private static final Logger LOG = LoggerFactory.getLogger(StatefulFF.class); /* Every stateful FF takes a unique index value and increments this. */ static int GLOBAL_STATE_INDEX = 0; @@ -49,7 +52,7 @@ public abstract class StatefulFF extends FeatureFunction { public StatefulFF(FeatureVector weights, String name, String[] args, JoshuaConfiguration config) { super(weights, name, args, config); - Decoder.LOG(1, "Stateful object with state index " + GLOBAL_STATE_INDEX); + LOG.info("Stateful object with state index {}", GLOBAL_STATE_INDEX); stateIndex = GLOBAL_STATE_INDEX++; } diff --git a/src/joshua/decoder/ff/StatelessFF.java b/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java similarity index 83% rename from src/joshua/decoder/ff/StatelessFF.java rename to src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java index 198219bd..e473c370 100644 --- a/src/joshua/decoder/ff/StatelessFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java @@ -16,23 +16,23 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** * Stateless feature functions do not contribute any state. You need not implement this class to * create a stateless feature function, but it provides a few convenience functions. 
* - * @author Matt Post - * @author Juri Ganitkevich + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevich juri@cs.jhu.edu */ public abstract class StatelessFF extends FeatureFunction { diff --git a/src/joshua/decoder/ff/TargetBigram.java b/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java similarity index 79% rename from src/joshua/decoder/ff/TargetBigram.java rename to src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java index 846273dd..9e1b06c5 100644 --- a/src/joshua/decoder/ff/TargetBigram.java +++ b/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java @@ -16,52 +16,52 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.io.IOException; import java.util.HashSet; -import java.util.LinkedList; +import java.util.LinkedList; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.state_maintenance.NgramDPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.util.io.LineReader; /*** * The RuleBigram feature is an indicator feature that counts target word bigrams that are created when * a rule is applied. It accepts three parameters: - * + * * -vocab /path/to/vocab - * + * * The path to a vocabulary, where each line is of the format ID WORD COUNT. - * + * * -threshold N - * + * * Mask to UNK all words whose COUNT is less than N. - * + * * -top-n N - * + * * Only use the top N words. */ public class TargetBigram extends StatefulFF { - + private HashSet vocab = null; private int maxTerms = 1000000; private int threshold = 0; public TargetBigram(FeatureVector weights, String[] args, JoshuaConfiguration config) { super(weights, "TargetBigram", args, config); - + if (parsedArgs.containsKey("threshold")) threshold = Integer.parseInt(parsedArgs.get("threshold")); - + if (parsedArgs.containsKey("top-n")) maxTerms = Integer.parseInt(parsedArgs.get("top-n")); @@ -72,11 +72,11 @@ public TargetBigram(FeatureVector weights, String[] args, JoshuaConfiguration co /** * Load vocabulary items passing the 'threshold' and 'top-n' filters. 
- * + * @param filename */ private void loadVocab(String filename) { - this.vocab = new HashSet<String>(); + this.vocab = new HashSet<String>(); this.vocab.add("<s>"); this.vocab.add("</s>"); try { @@ -84,18 +84,18 @@ private void loadVocab(String filename) { for (String line: lineReader) { if (lineReader.lineno() > maxTerms) break; - + String[] tokens = line.split("\\s+"); String word = tokens[1]; int count = Integer.parseInt(tokens[2]); - + if (count >= threshold) vocab.add(word); } } catch (IOException e) { - System.err.println(String.format("* FATAL: couldn't load TargetBigram vocabulary '%s'", filename)); - System.exit(1); + throw new RuntimeException(String.format( + "* FATAL: couldn't load TargetBigram vocabulary '%s'", filename), e); } } @@ -107,7 +107,7 @@ public DPState compute(Rule rule, List<HGNode> tailNodes, int spanStart, int spa int left = -1; int right = -1; - + List<String> currentNgram = new LinkedList<String>(); for (int c = 0; c < enWords.length; c++) { int curID = enWords[c]; @@ -127,7 +127,7 @@ public DPState compute(Rule rule, List<HGNode> tailNodes, int spanStart, int spa if (currentNgram.size() == 2) { String ngram = join(currentNgram); acc.add(String.format("%s_%s", name, ngram), 1); -// System.err.println(String.format("ADDING %s_%s", name, ngram)); + // System.err.println(String.format("ADDING %s_%s", name, ngram)); currentNgram.remove(0); } } @@ -144,20 +144,20 @@ public DPState compute(Rule rule, List<HGNode> tailNodes, int spanStart, int spa if (currentNgram.size() == 2) { String ngram = join(currentNgram); acc.add(String.format("%s_%s", name, ngram), 1); -// System.err.println(String.format("ADDING %s_%s", name, ngram)); + // System.err.println(String.format("ADDING %s_%s", name, ngram)); currentNgram.remove(0); } } } NgramDPState state = new NgramDPState(new int[] { left }, new int[] { right }); -// System.err.println(String.format("RULE %s -> state %s", rule.getRuleString(), state)); + // System.err.println(String.format("RULE %s -> state %s", rule.getRuleString(), state)); return state; } /** * Returns the word after comparing against the private vocabulary (if set). - * + * * @param curID * @return the word */ @@ -165,9 +165,9 @@ private String getWord(int curID) { String word = Vocabulary.word(curID); if (vocab != null && ! vocab.contains(word)) { - return "UNK"; + return "UNK"; } - + return word; } @@ -180,13 +180,13 @@ public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) { } /** - * There is nothing to be done here, since <s> and </s> are included in rules that are part + * There is nothing to be done here, since <s> and </s> are included in rules that are part * of the grammar. We simply return the DP state of the tail node. */ @Override public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence, Accumulator acc) { - + return tailNode.getDPState(stateIndex); } @@ -200,7 +200,7 @@ public float estimateCost(Rule rule, Sentence sentence) { /** * Join a list with the _ character. I am sure this is in a library somewhere.
- * + * * @param list a list of strings * @return the joined String */ @@ -212,4 +212,16 @@ private String join(List list) { return sb.substring(0, sb.length() - 1); } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/WordPenalty.java b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java similarity index 76% rename from src/joshua/decoder/ff/WordPenalty.java rename to src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java index 583b59c9..62c889f8 100644 --- a/src/joshua/decoder/ff/WordPenalty.java +++ b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java @@ -16,23 +16,23 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff; +package org.apache.joshua.decoder.ff; import java.util.ArrayList; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.phrase.Hypothesis; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.phrase.Hypothesis; +import org.apache.joshua.decoder.segment_file.Sentence; /** * - * @author Zhifei Li - * @author Matt Post + * @author Zhifei Li zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public final class WordPenalty extends StatelessFF { @@ -75,4 +75,16 @@ public float estimateCost(Rule rule, Sentence sentence) { return weights.getDense(denseFeatureIndex) * OMEGA * (rule.getEnglish().length - rule.getArity()); return 0.0f; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java similarity index 91% rename from src/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java rename to src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java index b19d8979..f75dffa1 100644 --- a/src/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java +++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.fragmentlm; +package org.apache.joshua.decoder.ff.fragmentlm; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; /** * Concatenates an iterator over iterators into one long iterator. 
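The idea behind that class is simple enough to sketch independently; the following self-contained example (the names here are invented for illustration, not taken from the patch) walks an iterator of iterators as one long sequence, skipping over exhausted and empty sub-iterators.

    // Minimal sketch of iterator concatenation, in the spirit of
    // ConcatenationIterator but not a copy of it.
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    import java.util.NoSuchElementException;

    public class ConcatSketch<T> implements Iterator<T> {
      private final Iterator<? extends Iterator<? extends T>> sources;
      private Iterator<? extends T> current;

      public ConcatSketch(Iterator<? extends Iterator<? extends T>> sources) {
        this.sources = sources;
      }

      @Override
      public boolean hasNext() {
        // Advance past exhausted sub-iterators until one has an element left.
        while ((current == null || !current.hasNext()) && sources.hasNext()) {
          current = sources.next();
        }
        return current != null && current.hasNext();
      }

      @Override
      public T next() {
        if (!hasNext()) throw new NoSuchElementException();
        return current.next();
      }

      public static void main(String[] args) {
        List<Iterator<Integer>> parts = Arrays.asList(
            List.of(1, 2).iterator(), List.<Integer>of().iterator(), List.of(3).iterator());
        Iterator<Integer> it = new ConcatSketch<Integer>(parts.iterator());
        while (it.hasNext()) System.out.print(it.next() + " "); // prints: 1 2 3
      }
    }
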
diff --git a/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java similarity index 75% rename from src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java rename to src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java index 0375dc03..b191c2f2 100644 --- a/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.fragmentlm; +package org.apache.joshua.decoder.ff.fragmentlm; import java.io.IOException; import java.util.ArrayList; @@ -26,48 +26,52 @@ import java.util.List; import java.util.Stack; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Feature function that reads in a list of language model fragments and matches them against the + *
<p>Feature function that reads in a list of language model fragments and matches them against the * hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments * (supplied as input) are assembled. These LM fragments are presumably useful in ensuring - * grammaticality and can be independent of the translation model fragments. + * grammaticality and can be independent of the translation model fragments.</p> * - * Usage: in the Joshua Configuration file, put + * <p>Usage: in the Joshua Configuration file, put</p> * - * feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE + * <code>feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE</code> * - * LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that it should look for. - * The format of the file is one fragment per line in PTB format, e.g.: + * <p>LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that it should look for. + * The format of the file is one fragment per line in PTB format, e.g.:</p> * - * (S NP (VP (VBD said) SBAR) (. .)) + * <code>(S NP (VP (VBD said) SBAR) (. .))</code> * - * RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format + * <p>RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format * that Joshua uses. This mapping is necessary because Joshua's rules have been flattened, meaning * that their internal structure has been removed, yet this structure is needed for matching LM - * fragments. The format of the file is + * fragments. The format of the file is</p> * - * FRAGMENT ||| RULE-TARGET-SIDE + * <code>FRAGMENT ||| RULE-TARGET-SIDE</code> * - * for example, + * <p>for example,</p>
* - * (S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2] (SBAR (IN that) (S (NP (PRP he)) (VP - * (VBD was) (VB done)))) ||| that he was done (VP (VBD said) SBAR) ||| said SBAR + * (S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2] (SBAR (IN that) (S (NP (PRP he)) (VP + * (VBD was) (VB done)))) ||| that he was done (VP (VBD said) SBAR) ||| said SBAR * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class FragmentLMFF extends StatefulFF { + private static final Logger LOG = LoggerFactory.getLogger(FragmentLMFF.class); + /* * When building a fragment from a rule rooted in the hypergraph, this parameter determines how * deep we'll go. Smaller values mean less hypergraph traversal but may also limit the LM @@ -104,9 +108,9 @@ public class FragmentLMFF extends StatefulFF { private String fragmentLMFile = ""; /** - * @param weights - * @param name - * @param stateComputer + * @param weights a {@link org.apache.joshua.decoder.ff.FeatureVector} with weights + * @param args arguments passed to the feature function + * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) { super(weights, "FragmentLMFF", args, config); @@ -128,18 +132,16 @@ public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration co // lmFragments.get(lmFragments.size()-1))); } } catch (IOException e) { - System.err.println(String.format("* WARNING: couldn't read fragment LM file '%s'", - fragmentLMFile)); - System.exit(1); + throw new RuntimeException(String.format("* WARNING: couldn't read fragment LM file '%s'", + fragmentLMFile), e); } - System.err.println(String.format("FragmentLMFF: Read %d LM fragments from '%s'", numFragments, - fragmentLMFile)); + LOG.info("FragmentLMFF: Read {} LM fragments from '{}'", numFragments, fragmentLMFile); } /** * Add the provided fragment to the language model, subject to some filtering. * - * @param fragment + * @param fragment a {@link org.apache.joshua.decoder.ff.fragmentlm.Tree} fragment */ public void addLMFragment(Tree fragment) { if (lmFragments == null) @@ -148,19 +150,18 @@ public void addLMFragment(Tree fragment) { int fragmentDepth = fragment.getDepth(); if (MAX_DEPTH != 0 && fragmentDepth > MAX_DEPTH) { - System.err.println(String.format(" Skipping fragment %s (depth %d > %d)", fragment, - fragmentDepth, MAX_DEPTH)); + LOG.warn("Skipping fragment {} (depth {} > {})", fragment, fragmentDepth, MAX_DEPTH); return; } if (MIN_LEX_DEPTH > 1 && fragment.isLexicalized() && fragmentDepth < MIN_LEX_DEPTH) { - System.err.println(String.format(" Skipping fragment %s (lex depth %d < %d)", fragment, - fragmentDepth, MIN_LEX_DEPTH)); + LOG.warn("Skipping fragment {} (lex depth {} < {})", fragment, fragmentDepth, MIN_LEX_DEPTH); return; } - if (lmFragments.get(fragment.getRule()) == null) + if (lmFragments.get(fragment.getRule()) == null) { lmFragments.put(fragment.getRule(), new ArrayList()); + } lmFragments.get(fragment.getRule()).add(fragment); numFragments++; } @@ -170,6 +171,15 @@ public void addLMFragment(Tree fragment) { * that fire are any LM fragments that match the fragment associated with the current rule. LM * fragments may recurse over the tail nodes, following 1-best backpointers until the fragment * either matches or fails. 
+ * + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes + * @param i todo + * @param j todo + * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice} + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation + * @return the new dynamic programming state (null for stateless features) */ @Override public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePath sourcePath, @@ -315,14 +325,14 @@ public static void main(String[] args) { Tree tree = Tree.buildTree(ruleS, tailNodes, 1); boolean matched = fragmentLMFF.match(fragment, tree); - System.err.println(String.format("Does\n %s match\n %s??\n -> %s", fragment, tree, matched)); + LOG.info("Does\n {} match\n {}??\n -> {}", fragment, tree, matched); } /** * Maintains a state pointer used by KenLM to implement left-state minimization. * - * @author Matt Post - * @author Juri Ganitkevitch + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevitch juri@cs.jhu.edu */ public class FragmentState extends DPState { @@ -353,4 +363,16 @@ public String toString() { } } + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } + } diff --git a/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java similarity index 98% rename from src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java rename to src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java index 6ab52e1a..1637b5f4 100644 --- a/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java +++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.fragmentlm; +package org.apache.joshua.decoder.ff.fragmentlm; import java.util.*; import java.io.*; diff --git a/src/joshua/decoder/ff/fragmentlm/Tree.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Tree.java similarity index 89% rename from src/joshua/decoder/ff/fragmentlm/Tree.java rename to src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Tree.java index b52cccef..07c7ecd4 100644 --- a/src/joshua/decoder/ff/fragmentlm/Tree.java +++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Tree.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff.fragmentlm; +package org.apache.joshua.decoder.ff.fragmentlm; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.util.*; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.fragmentlm.Trees.PennTreeReader; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; -import joshua.decoder.hypergraph.KBestExtractor.DerivationState; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.fragmentlm.Trees.PennTreeReader; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationState; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Represent phrase-structure trees, with each node consisting of a label and a list of children. @@ -39,10 +41,11 @@ * enclosed in double-quotes when read in. * * @author Dan Klein - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class Tree implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(Tree.class); private static final long serialVersionUID = 1L; protected int label; @@ -112,7 +115,7 @@ public int getLabel() { /** * Computes the depth-one rule rooted at this node. If the node has no children, null is returned. * - * @return + * @return string representation of the rule */ public String getRule() { if (isLeaf()) { @@ -238,6 +241,8 @@ private static void appendPreTerminalYield(Tree tree, List yield) { * A tree is lexicalized if it has terminal nodes among the leaves of its frontier. For normal * trees this is always true since they bottom out in terminals, but for fragments, this may or * may not be true. + * + * @return true if the tree is lexicalized */ public boolean isLexicalized() { if (this.numLexicalItems < 0) { @@ -314,7 +319,7 @@ public String toString() { * Removes the quotes around terminals. Note that the resulting tree could not be read back * in by this class, since unquoted leaves are interpreted as nonterminals. * - * @return + * @return unquoted string */ public String unquotedString() { return toString().replaceAll("\"", ""); @@ -451,8 +456,8 @@ else if (tree.getChildren().get(0).isPreTerminal()) * models. The arguments have to be passed in to preserve Java generics, even though this is only * ever used with String versions. * - * @param sos presumably "" - * @param eos presumably "" + * @param sos presumably "<s>" + * @param eos presumably "</s>" */ public void insertSentenceMarkers(String sos, String eos) { insertSentenceMarker(sos, 0); @@ -466,8 +471,8 @@ public void insertSentenceMarkers() { /** * - * @param symbol - * @param pos + * @param symbol the marker to insert + * @param pos the position at which to insert */ private void insertSentenceMarker(String symbol, int pos) { @@ -488,6 +493,9 @@ private void insertSentenceMarker(String symbol, int pos) { /** * This is a convenience function for producing a fragment from its string representation. 
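A minimal usage sketch of this convenience function, using the patched Tree class; the fragment string is an invented example (quoted leaves are terminals, unquoted ones are nonterminals, per the class javadoc above):

    import org.apache.joshua.decoder.ff.fragmentlm.Tree;

    public class FromStringDemo {
      public static void main(String[] args) {
        // Hypothetical fragment: "the" and "man" are terminals, VP is a frontier nonterminal.
        Tree fragment = Tree.fromString("(S (NP (DT \"the\") (NN \"man\")) VP)");
        System.out.println(fragment.getRule());        // the depth-one rule rooted at S
        System.out.println(fragment.isLexicalized());  // true: the frontier contains terminals
      }
    }
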
+ * + * @param ptbStr input string from which to produce a fragment + * @return the fragment */ public static Tree fromString(String ptbStr) { PennTreeReader reader = new PennTreeReader(new StringReader(ptbStr)); @@ -510,20 +518,18 @@ public static void readMapping(String fragmentMappingFile) { for (String line : reader) { String[] fields = line.split("\\s+\\|{3}\\s+"); if (fields.length != 2 || !fields[0].startsWith("(")) { - System.err.println(String.format("* WARNING: malformed line %d: %s", reader.lineno(), - line)); + LOG.warn("malformed line {}: {}", reader.lineno(), line); continue; } rulesToFragmentStrings.put(fields[1].trim(), fields[0].trim()); // buildFragment(fields[0])); } } catch (IOException e) { - System.err.println(String.format("* WARNING: couldn't read fragment mapping file '%s'", - fragmentMappingFile)); - System.exit(1); + throw new RuntimeException(String.format("* WARNING: couldn't read fragment mapping file '%s'", + fragmentMappingFile), e); } - System.err.println(String.format("FragmentLMFF: Read %d mappings from '%s'", - rulesToFragmentStrings.size(), fragmentMappingFile)); + LOG.info("FragmentLMFF: Read {} mappings from '{}'", rulesToFragmentStrings.size(), + fragmentMappingFile); } /** @@ -532,14 +538,13 @@ public static void readMapping(String fragmentMappingFile) { * recursively visit the derivation state objects, following the route through the hypergraph * defined by them. * - * This function is like the other buildTree() function, but that one simply follows the best - * incoming hyperedge for each node. + * This function is like Tree#buildTree(DerivationState, int), + * but that one simply follows the best incoming hyperedge for each node. * - * @param rule - * @param tailNodes - * @param derivation - should not be null - * @param maxDepth - * @return + * @param rule for which corresponding internal fragment can be used to initialize the tree + * @param derivationStates array of state objects + * @param maxDepth of route through the hypergraph + * @return the Tree */ public static Tree buildTree(Rule rule, DerivationState[] derivationStates, int maxDepth) { Tree tree = getFragmentFromYield(rule.getEnglishWords()); @@ -549,10 +554,12 @@ public static Tree buildTree(Rule rule, DerivationState[] derivationStates, int } tree = tree.shallowClone(); - - System.err.println(String.format("buildTree(%s)", tree)); - for (int i = 0; i < derivationStates.length; i++) { - System.err.println(String.format(" -> %d: %s", i, derivationStates[i])); + + if (LOG.isDebugEnabled()) { + LOG.debug("buildTree({})", tree); + for (int i = 0; i < derivationStates.length; i++) { + LOG.debug(" -> {}: {}", i, derivationStates[i]); + } } List frontier = tree.getNonterminalYield(); @@ -604,19 +611,14 @@ public static Tree buildTree(Rule rule, DerivationState[] derivationStates, int } /** - * Builds a tree from the kth-best derivation state. This is done by initializing the tree with + *
Builds a tree from the kth-best derivation state. This is done by initializing the tree with * the internal fragment corresponding to the rule; this will be the top of the tree. We then * recursively visit the derivation state objects, following the route through the hypergraph - * defined by them. + * defined by them.
* - * This function is like the other buildTree() function, but that one simply follows the best - * incoming hyperedge for each node. - * - * @param rule - * @param tailNodes - * @param derivation - * @param maxDepth - * @return + * @param derivationState array of state objects + * @param maxDepth of route through the hypergraph + * @return the Tree */ public static Tree buildTree(DerivationState derivationState, int maxDepth) { Rule rule = derivationState.edge.getRule(); @@ -629,7 +631,7 @@ public static Tree buildTree(DerivationState derivationState, int maxDepth) { tree = tree.shallowClone(); - System.err.println(String.format("buildTree(%s)", tree)); + LOG.debug("buildTree({})", tree); if (rule.getArity() > 0 && maxDepth > 0) { List frontier = tree.getNonterminalYield(); @@ -677,9 +679,10 @@ public static Tree buildTree(DerivationState derivationState, int maxDepth) { * This could be implemented by using the other buildTree() function and using the 1-best * DerivationState. * - * @param rule - * @param tailNodes - * @return + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be used whilst building the tree + * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode}'s + * @param maxDepth to go in the tree + * @return shallow clone of the Tree object */ public static Tree buildTree(Rule rule, List tailNodes, int maxDepth) { Tree tree = getFragmentFromYield(rule.getEnglishWords()); @@ -730,13 +733,13 @@ public static Tree buildTree(Rule rule, List tailNodes, int maxDepth) { frontierTree.children = tree.children; } } catch (IndexOutOfBoundsException e) { - System.err.println(String.format("ERROR at index %d", i)); - System.err.println(String.format("RULE: %s TREE: %s", rule.getEnglishWords(), tree)); - System.err.println(" FRONTIER:"); - for (Tree kid : frontier) - System.err.println(" " + kid); - e.printStackTrace(); - System.exit(1); + LOG.error("ERROR at index {}", i); + LOG.error("RULE: {} TREE: {}", rule.getEnglishWords(), tree); + LOG.error(" FRONTIER:"); + for (Tree kid : frontier) { + LOG.error(" {}", kid); + } + throw new RuntimeException(String.format("ERROR at index %d", i), e); } } } diff --git a/src/joshua/decoder/ff/fragmentlm/Trees.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Trees.java similarity index 95% rename from src/joshua/decoder/ff/fragmentlm/Trees.java rename to src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Trees.java index 94a0f449..d06388cd 100644 --- a/src/joshua/decoder/ff/fragmentlm/Trees.java +++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/Trees.java @@ -16,16 +16,18 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.fragmentlm; +package org.apache.joshua.decoder.ff.fragmentlm; import java.io.IOException; import java.io.PushbackReader; import java.io.Reader; import java.io.StringReader; -import java.util.*; - -import joshua.corpus.Vocabulary; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import org.apache.joshua.corpus.Vocabulary; /** * Tools for displaying, reading, and modifying trees. Borrowed from the Berkeley Parser. 
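For reference, the ` ||| `-separated mapping lines that readMapping() above consumes can be split exactly as the patch does; a small self-contained sketch (the sample line is taken from the FragmentLMFF javadoc earlier in this patch):

    public class MappingLineDemo {
      public static void main(String[] args) {
        String line = "(VP (VBD said) SBAR) ||| said SBAR";
        // Same regex the patch uses: three pipes surrounded by whitespace.
        String[] fields = line.split("\\s+\\|{3}\\s+");
        if (fields.length != 2 || !fields[0].startsWith("(")) {
          System.err.println("malformed line: " + line);
          return;
        }
        System.out.println("fragment:    " + fields[0].trim());
        System.out.println("target side: " + fields[1].trim());
      }
    }
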
* @@ -166,7 +168,7 @@ public void remove() { } public PennTreeReader(Reader in) { - this.in = new PushbackReader(in); + this.in = new PushbackReader((java.io.Reader) in); nextTree = readRootTree(); // System.out.println(nextTree); } @@ -185,6 +187,9 @@ public static class PennTreeRenderer { * preterminals onto one line of tags and words. Additional complexities are that conjunctions * (tag CC) are not collapsed in this way, and that the unlabeled outer brackets are collapsed * onto the same line as the next bracket down. + * + * @param tree you wish to render and print + * @return a rendered String representation of the tree */ public static String render(Tree tree) { StringBuilder sb = new StringBuilder(); diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java new file mode 100644 index 00000000..e8225dc4 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm; + +import org.apache.joshua.decoder.Support; +import java.util.List; + +/** + * This class implements NGramLanguageModel by creating wrappers + * around the necessary functions to capture common errors. Most + * methods are declared final, in an attempt to limit what subclasses + * may be defined. + * + * @author Zhifei Li, zhifei.work@gmail.com + * @version $LastChangedDate: 2009-12-30 10:10:38 -0600 (Wed, 30 Dec 2009) $ + */ +public abstract class AbstractLM extends DefaultNGramLanguageModel { + + public AbstractLM(int symbolTable, int order) { + super(symbolTable, order); + } + + @SuppressWarnings("null") + public final double sentenceLogProbability( + List sentence, int order, int startIndex + ) { + //return super.sentenceLogProbability(sentence.stream().toArray(int[]::new) , order, startIndex); + return (Double) null; + } + + public final float ngramLogProbability(int[] ngram) { + return super.ngramLogProbability(ngram); + } + + public final float ngramLogProbability(int[] ngram, int order) { + if (ngram.length > order) { + throw new RuntimeException("ngram length is greater than the max order"); + } + // if (ngram.length==1 && "we".equals(symbolTable.getWord(ngram[0]))) { + // System.err.println("Something weird is about to happen"); + // } + + int historySize = ngram.length - 1; + if (historySize >= order || historySize < 0) { + // BUG: use logger or exception.
Don't zero default + throw new RuntimeException("Error: history size is " + historySize); + // return 0; + } + double probability = ngramLogProbability_helper(ngram, order); +// if (probability < -JoshuaConfiguration.lm_ceiling_cost) { +// probability = -JoshuaConfiguration.lm_ceiling_cost; +// } + return (float) probability; + } + + protected abstract float ngramLogProbability_helper(int[] ngram, int order); + + @Deprecated + public final double logProbOfBackoffState(List ngram, int order, int qtyAdditionalBackoffWeight) { + return logProbabilityOfBackoffState( + Support.subIntArray(ngram, 0, ngram.size()), + order, qtyAdditionalBackoffWeight); + } + + + public final double logProbabilityOfBackoffState(int[] ngram, int order, int qtyAdditionalBackoffWeight) { + if (ngram.length > order) { + throw new RuntimeException("ngram length is greater than the max order"); + } + if (ngram[ngram.length-1] != LanguageModelFF.LM_INDEX) { + throw new RuntimeException("last word is not "); + } + if (qtyAdditionalBackoffWeight > 0) { + return logProbabilityOfBackoffState_helper( + ngram, order, qtyAdditionalBackoffWeight); + } else { + return 0.0; + } + } + + + protected abstract double logProbabilityOfBackoffState_helper( + int[] ngram, int order, int qtyAdditionalBackoffWeight); + + + // BUG: We should have different classes based on the configuration in use + public int[] leftEquivalentState(int[] originalState, int order, + double[] cost + ) { +// if (JoshuaConfiguration.use_left_equivalent_state) +// throw new UnsupportedOperationException("getLeftEquivalentState is not overwritten by a concrete class"); + + return originalState; + } + + + // BUG: We should have different classes based on the configuration in use + public int[] rightEquivalentState(int[] originalState, int order) { +// if ( !JoshuaConfiguration.use_right_equivalent_state +// || originalState.length != this.ngramOrder-1) { + return originalState; +// } else { +// throw new UnsupportedOperationException("getRightEquivalentState is not overwritten by a concrete class"); +// } + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java new file mode 100644 index 00000000..01d2f391 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.joshua.decoder.ff.lm; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Scanner; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; + +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.util.Regex; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility class for reading ARPA language model files. + * + * @author Lane Schwartz + */ +public class ArpaFile implements Iterable { + + private static final Logger LOG = LoggerFactory.getLogger(ArpaFile.class); + + /** Regular expression representing a blank line. */ + public static final Regex BLANK_LINE = new Regex("^\\s*$"); + + /** + * Regular expression representing a line + * starting a new section of n-grams in an ARPA language model file. + */ + public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$"); + + /** + * Regular expression representing a line + * ending an ARPA language model file. + */ + public static final Regex NGRAM_END = new Regex("^\\\\end\\\\s*$"); + + /** ARPA file for this object. */ + private final File arpaFile; + + /** The vocabulary associated with this object. */ + private final Vocabulary vocab; + + /** + * Constructs an object that represents an ARPA language model file. + * + * @param arpaFileName File name of an ARPA language model file + * @param vocab Symbol table to be used by this object + */ + public ArpaFile(String arpaFileName, Vocabulary vocab) { + this.arpaFile = new File(arpaFileName); + this.vocab = vocab; + } + + public ArpaFile(String arpaFileName) throws IOException { + this.arpaFile = new File(arpaFileName); + this.vocab = new Vocabulary(); + + // final Scanner scanner = new Scanner(arpaFile); + + // // Eat initial header lines + // while (scanner.hasNextLine()) { + // String line = scanner.nextLine(); + // logger.finest("Discarding line: " + line); + // if (NGRAM_HEADER.matches(line)) { + // break; + // } + // } + + // int ngramOrder = 1; + + LineReader grammarReader = new LineReader(arpaFileName); + + try { + for (String line : grammarReader) { + + + // while (scanner.hasNext()) { + // + // String line = scanner.nextLine(); + + String[] parts = Regex.spaces.split(line); + if (parts.length > 1) { + String[] words = Regex.spaces.split(parts[1]); + + for (String word : words) { + LOG.debug("Adding to vocab: {}", word); + Vocabulary.addAll(word); + } + } else { + LOG.info(line); + } + + } + } finally { + grammarReader.close(); + } + + // + // boolean lineIsHeader = NGRAM_HEADER.matches(line); + // + // while (lineIsHeader || BLANK_LINE.matches(line)) { + // + // if (lineIsHeader) { + // ngramOrder++; + // } + // + // if (scanner.hasNext()) { + // line = scanner.nextLine().trim(); + // lineIsHeader = NGRAM_HEADER.matches(line); + // } else { + // logger.severe("Ran out of lines!"); + // return; + // } + // } + + + // + // // Add word to vocab + // if (logger.isLoggable(Level.FINE)) logger.fine("Adding word to vocab: " + parts[ngramOrder]); + // vocab.addTerminal(parts[ngramOrder]); + // + // // Add context words to vocab + // for (int i=1; i iterator() { + + try { + final Scanner scanner; + + if (arpaFile.getName().endsWith("gz")) { + InputStream in = new GZIPInputStream( + new FileInputStream(arpaFile)); + scanner = new 
Scanner(in); + } else { + scanner = new Scanner(arpaFile); + } + + // Eat initial header lines + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + LOG.debug("Discarding line: {}", line); + if (NGRAM_HEADER.matches(line)) { + break; + } + } + + return new Iterator() { + + String nextLine = null; + int ngramOrder = 1; + // int id = 0; + + public boolean hasNext() { + + if (scanner.hasNext()) { + + String line = scanner.nextLine(); + + boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line); + + while (lineIsHeader || BLANK_LINE.matches(line)) { + + if (lineIsHeader) { + ngramOrder++; + } + + if (scanner.hasNext()) { + line = scanner.nextLine().trim(); + lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line); + } else { + nextLine = null; + return false; + } + } + + nextLine = line; + return true; + + } else { + nextLine = null; + return false; + } + + } + + public ArpaNgram next() { + if (nextLine!=null) { + + String[] parts = Regex.spaces.split(nextLine); + + float value = Float.valueOf(parts[0]); + + int word = Vocabulary.id(parts[ngramOrder]); + + int[] context = new int[ngramOrder-1]; + for (int i=1; i ngramOrder+1) { + backoff = Float.valueOf(parts[parts.length-1]); + } else { + backoff = ArpaNgram.DEFAULT_BACKOFF; + } + + nextLine = null; + return new ArpaNgram(word, context, value, backoff); + + } else { + throw new NoSuchElementException(); + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + }; + } catch (IOException e) { + LOG.error(e.getMessage(), e); + return null; + } + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java new file mode 100644 index 00000000..d0077d1b --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm; + +/** + * Represents a single n-gram line + * from an ARPA language model file. + * + * @author Lane Schwartz + */ +public class ArpaNgram { + + + /** Indicates an invalid probability value. */ + public static final float INVALID_VALUE = Float.NaN; + + /** Default backoff value. 
*/ + public static final float DEFAULT_BACKOFF = 0.0f; + + private final int word; + private final int[] context; + private final float value; + private final float backoff; + // private final int id; + + public ArpaNgram(int word, int[] context, float value, float backoff) { + this.word = word; + this.context = context; + this.value = value; + this.backoff = backoff; + // this.id = id; + } + + // public int getID() { + // return id; + // } + + public int order() { + return context.length + 1; + } + + public int getWord() { + return word; + } + + public int[] getContext() { + return context; + } + + public float getValue() { + return value; + } + + public float getBackoff() { + return backoff; + } +} \ No newline at end of file diff --git a/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java similarity index 82% rename from src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java index 20f29f19..d5cf8e94 100644 --- a/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java @@ -16,28 +16,27 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; import java.util.Arrays; -import java.util.logging.Level; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class provides a default implementation for the Equivalent LM State optimization (namely, * don't back off anywhere). It also provides some default implementations for more general - * functions on the interface to fall back to more specific ones (e.g. from ArrayList to - * int[]) and a default implementation for sentenceLogProbability which enumerates the n-grams and - * calls calls ngramLogProbability for each of them. + * functions on the interface to fall back to more specific ones (e.g. from {@link java.util.ArrayList} + * of {@link java.lang.Integer}'s to int[]) and a default implementation for sentenceLogProbability + * which enumerates the n-grams and calls ngramLogProbability for each of them. * - * @author Zhifei Li, - * @author wren ng thornton + * @author Zhifei Li, zhifei.work@gmail.com + * @author wren ng thornton wren@users.sourceforge.net */ public abstract class DefaultNGramLanguageModel implements NGramLanguageModel { - /** Logger for this class. */ - private static final Logger logger = Logger.getLogger(DefaultNGramLanguageModel.class.getName()); + private static final Logger LOG = LoggerFactory.getLogger(DefaultNGramLanguageModel.class); protected final int ngramOrder; @@ -88,10 +87,8 @@ public float sentenceLogProbability(int[] sentence, int order, int startIndex) { // start_index=2.
othercase, need to check) int[] ngram = Arrays.copyOfRange(sentence, 0, j); double logProb = ngramLogProbability(ngram, order); - if (logger.isLoggable(Level.FINE)) { - String words = Vocabulary.getWords(ngram); - logger.fine("\tlogp ( " + words + " ) = " + logProb); - } + + LOG.debug("\tlogp ({}) = {}", Vocabulary.getWords(ngram), logProb); probability += logProb; } @@ -99,10 +96,7 @@ public float sentenceLogProbability(int[] sentence, int order, int startIndex) { for (int i = 0; i <= sentenceLength - order; i++) { int[] ngram = Arrays.copyOfRange(sentence, i, i + order); double logProb = ngramLogProbability(ngram, order); - if (logger.isLoggable(Level.FINE)) { - String words = Vocabulary.getWords(ngram); - logger.fine("\tlogp ( " + words + " ) = " + logProb); - } + LOG.debug("\tlogp ({}) = {} ", Vocabulary.getWords(ngram), logProb); probability += logProb; } diff --git a/src/joshua/decoder/ff/lm/KenLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java similarity index 79% rename from src/joshua/decoder/ff/lm/KenLM.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java index 329b631e..0d45879e 100644 --- a/src/joshua/decoder/ff/lm/KenLM.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java @@ -16,11 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.lm.NGramLanguageModel; -import joshua.decoder.ff.state_maintenance.KenLMState; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.lm.NGramLanguageModel; +import org.apache.joshua.decoder.ff.state_maintenance.KenLMState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * JNI wrapper for KenLM. This version of KenLM supports two use cases, implemented by the separate @@ -29,22 +31,25 @@ * state by itself and just passes in the ngrams for scoring. * * @author Kenneth Heafield - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class KenLM implements NGramLanguageModel, Comparable { + private static final Logger LOG = LoggerFactory.getLogger(KenLM.class); + static { try { System.loadLibrary("ken"); } catch (UnsatisfiedLinkError e) { - System.err.println("* FATAL: Can't find libken.so (libken.dylib on OS X) in $JOSHUA/lib"); - System.err.println("* This probably means that the KenLM library didn't compile."); - System.err.println("* Make sure that BOOST_ROOT is set to the root of your boost"); - System.err.println("* installation (it's not /opt/local/, the default), change to"); - System.err.println("* $JOSHUA, and type 'ant kenlm'. If problems persist, see the"); - System.err.println("* website (joshua-decoder.org)."); - System.exit(1); + //TODO: send these prints to LOG.err + LOG.error("* FATAL: Can't find libken.so (libken.dylib on OS X) in $JOSHUA/lib"); + LOG.error("* This probably means that the KenLM library didn't compile."); + LOG.error("* Make sure that BOOST_ROOT is set to the root of your boost"); + LOG.error("* installation (it's not /opt/local/, the default), change to"); + LOG.error("* $JOSHUA, and type 'ant kenlm'. If problems persist, see the"); + LOG.error("* website (joshua-decoder.org)."); //FIXME: update link to newer url + throw new RuntimeException(e); } } @@ -90,6 +95,7 @@ public KenLM(int order, String file_name) { /** * Constructor if order is not known. * Order will be inferred from the model. 
+ * @param file_name string path to an input file */ public KenLM(String file_name) { pointer = construct(file_name); @@ -115,6 +121,8 @@ public float prob(int[] words) { /** * Query for n-gram probability using strings. + * @param words a string array of words + * @return float value denoting probability */ public float prob(String[] words) { return probForString(pointer, words); @@ -127,14 +135,15 @@ public float probString(int words[], int start) { /** * This function is the bridge to the interface in kenlm/lm/left.hh, which has KenLM score the - * whole rule. It takes a list of words and states retrieved from tail nodes (nonterminals in the + * whole rule. It takes an array of words and states retrieved from tail nodes (nonterminals in the * rule). Nonterminals have a negative value so KenLM can distinguish them. The sentence number is * needed so KenLM knows which memory pool to use. When finished, it returns the updated KenLM * state and the LM probability incurred along this rule. * - * @param words - * @param sentId - * @return + * @param words array of words + * @param poolPointer todo + * @return the updated {@link org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair} e.g. + * KenLM state and the LM probability incurred along this rule */ public StateProbPair probRule(long[] words, long poolPointer) { @@ -153,7 +162,7 @@ public StateProbPair probRule(long[] words, long poolPointer) { * Public facing function that estimates the cost of a rule, which value is used for sorting * rules during cube pruning. * - * @param words + * @param words array of words * @return the estimated cost of the rule (the (partial) n-gram probabilities of all words in the rule) */ public float estimateRule(long[] words) { @@ -161,8 +170,7 @@ public float estimateRule(long[] words) { try { estimate = estimateRule(pointer, words); } catch (NoSuchMethodError e) { - e.printStackTrace(); - System.exit(1); + throw new RuntimeException(e); } return estimate; @@ -170,6 +178,7 @@ public float estimateRule(long[] words) { /** * The start symbol for a KenLM is the Vocabulary.START_SYM. + * @return "<s>" */ public String getStartSymbol() { return Vocabulary.START_SYM; diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java similarity index 83% rename from src/joshua/decoder/ff/lm/LanguageModelFF.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java index a002de7c..47410f46 100644 --- a/src/joshua/decoder/ff/lm/LanguageModelFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; import java.io.IOException; import java.util.ArrayList; @@ -27,35 +27,39 @@ import com.google.common.primitives.Ints; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Support; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley; -import joshua.decoder.ff.lm.KenLM; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.state_maintenance.NgramDPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Support; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley; +import org.apache.joshua.decoder.ff.lm.KenLM; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class performs the following: *
* <ol> * <li>Gets the additional LM score due to combinations of small items into larger ones by using - * rules - * <li>Gets the LM state - * <li>Gets the left-side LM state estimation score + * rules</li> + * <li>Gets the LM state</li> + * <li>Gets the left-side LM state estimation score</li> * </ol>
* - * @author Matt Post - * @author Juri Ganitkevitch - * @author Zhifei Li, + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevitch juri@cs.jhu.edu + * @author Zhifei Li, zhifei.work@gmail.com */ public class LanguageModelFF extends StatefulFF { + private static final Logger LOG = LoggerFactory.getLogger(LanguageModelFF.class); + public static int LM_INDEX = 0; private int startSymbolId; @@ -65,13 +69,14 @@ public class LanguageModelFF extends StatefulFF { *
* <ol> * <li>We assume it is a backoff lm, and high-order ngram implies low-order ngram; absense of * low-order ngram implies high-order ngram - * - * <li>For a ngram, existence of backoffweight => existence a probability Two ways of dealing with + * + * <li>For a ngram, existence of backoffweight => existence a probability Two ways of dealing with * low counts: * <ul> * <li>SRILM: don't multiply zeros in for unknown words * <li>Pharaoh: cap at a minimum score exp(-10), including unknown words * </ul> * </ol> + *
*/ protected NGramLanguageModel languageModel; @@ -90,7 +95,7 @@ public class LanguageModelFF extends StatefulFF { /* Whether this is a class-based LM */ private boolean isClassLM; private ClassMap classMap; - + protected class ClassMap { private final int OOV_id = Vocabulary.getUnknownId(); @@ -114,13 +119,14 @@ public int getClassID(int wordID) { private void read(String file_name) throws IOException { int lineno = 0; - for (String line: new joshua.util.io.LineReader(file_name, false)) { + for (String line: new org.apache.joshua.util.io.LineReader(file_name, false)) { lineno++; String[] lineComp = line.trim().split("\\s+"); try { this.classMap.put(Vocabulary.id(lineComp[0]), Vocabulary.id(lineComp[1])); } catch (java.lang.ArrayIndexOutOfBoundsException e) { - System.err.println(String.format("* WARNING: bad vocab line #%d '%s'", lineno, line)); + LOG.warn("bad vocab line #{} '{}'", lineno, line); + LOG.warn(e.getMessage(), e); } } } @@ -133,7 +139,7 @@ public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration this.type = parsedArgs.get("lm_type"); this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order")); this.path = parsedArgs.get("lm_file"); - + if (parsedArgs.containsKey("class_map")) try { this.isClassLM = true; @@ -145,14 +151,14 @@ public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration // The dense feature initialization hasn't happened yet, so we have to retrieve this as sparse this.weight = weights.getSparse(name); - + initializeLM(); } - + @Override public ArrayList reportDenseFeatures(int index) { denseFeatureIndex = index; - + ArrayList names = new ArrayList(); names.add(name); return names; @@ -160,34 +166,30 @@ public ArrayList reportDenseFeatures(int index) { /** * Initializes the underlying language model. - * - * @param config - * @param type - * @param path */ protected void initializeLM() { if (type.equals("kenlm")) { this.languageModel = new KenLM(ngramOrder, path); - + } else if (type.equals("berkeleylm")) { this.languageModel = new LMGrammarBerkeley(ngramOrder, path); } else { - System.err.println(String.format("* FATAL: Invalid backend lm_type '%s' for LanguageModel", type)); - System.err.println(String.format("* Permissible values for 'lm_type' are 'kenlm' and 'berkeleylm'")); - System.exit(-1); + String msg = String.format("* FATAL: Invalid backend lm_type '%s' for LanguageModel", type) + + "* Permissible values for 'lm_type' are 'kenlm' and 'berkeleylm'"; + throw new RuntimeException(msg); } Vocabulary.registerLanguageModel(this.languageModel); Vocabulary.id(config.default_non_terminal); - + startSymbolId = Vocabulary.id(Vocabulary.START_SYM); } public NGramLanguageModel getLM() { return this.languageModel; } - + public String logString() { if (languageModel != null) return String.format("%s, order %d (weight %.3f)", name, languageModel.getOrder(), weight); @@ -220,9 +222,9 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa newState = computeTransition(rule.getEnglish(), tailNodes, acc); } } - + } - + return newState; } @@ -230,15 +232,19 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa * Input sentences can be tagged with information specific to the language model. This looks for * such annotations by following a word's alignments back to the source words, checking for * annotations, and replacing the surface word if such annotations are found. 
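The projection described above can be pictured with a toy, self-contained sketch; Rule and Sentence are replaced by invented arrays (align[t] is the source position of target token t, and annotation[s] is a per-source-position "class" tag, null when unannotated):

    import java.util.Arrays;

    public class TagProjectionDemo {
      public static void main(String[] args) {
        int[] target = {101, 102, 103};          // hypothetical target-side word ids
        int[] align = {0, 0, 1};                 // target token -> aligned source position
        String[] annotation = {"PERSON", null};  // "class" annotation per source position

        // Copy first so the shared rule object is never modified, as the patch stresses.
        int[] tokens = Arrays.copyOf(target, target.length);
        for (int t = 0; t < tokens.length; t++) {
          String cls = annotation[align[t]];
          if (cls != null) {
            tokens[t] = cls.hashCode(); // stand-in for Vocabulary.id(annotation)
          }
        }
        System.out.println(Arrays.toString(tokens));
      }
    }
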
- * + * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule} to use + * @param begin todo + * @param end todo + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @return todo */ protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) { /* Very important to make a copy here, so the original rule is not modified */ int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length); byte[] alignments = rule.getAlignment(); -// System.err.println(String.format("getTags() %s", rule.getRuleString())); - + // System.err.println(String.format("getTags() %s", rule.getRuleString())); + /* For each target-side token, project it to each of its source-language alignments. If any of those * are annotated, take the first annotation and quit. */ @@ -249,8 +255,8 @@ protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) { if (alignments[j] == i) { String annotation = sentence.getAnnotation((int)alignments[i] + begin, "class"); if (annotation != null) { -// System.err.println(String.format(" word %d source %d abs %d annotation %d/%s", -// i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation))); + // System.err.println(String.format(" word %d source %d abs %d annotation %d/%s", + // i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation))); tokens[i] = Vocabulary.id(annotation); break; } @@ -259,27 +265,27 @@ protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) { } } } - + return tokens; } - + /** * Sets the class map if this is a class LM - * @param classMap - * @throws IOException + * @param fileName a string path to a file + * @throws IOException if there is an error reading the input file */ public void setClassMap(String fileName) throws IOException { this.classMap = new ClassMap(fileName); } - - + /** * Replace each word in a rule with the target side classes. + * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to use when obtaining tokens + * @return int[] of tokens */ protected int[] getClasses(Rule rule) { if (this.classMap == null) { - System.err.println("The class map is not set. Cannot use the class LM "); - System.exit(2); + throw new RuntimeException("The class map is not set. Cannot use the class LM "); } /* Very important to make a copy here, so the original rule is not modified */ int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length); @@ -372,7 +378,7 @@ private NgramDPState computeTransition(int[] enWords, List tailNodes, Ac int ccount = 0; float transitionLogP = 0.0f; int[] left_context = null; - + for (int c = 0; c < enWords.length; c++) { int curID = enWords[c]; @@ -393,7 +399,7 @@ private NgramDPState computeTransition(int[] enWords, List tailNodes, Ac if (ccount == this.ngramOrder) { // Compute the current word probability, and remove it. 
float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder); -// System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob)); + // System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob)); transitionLogP += prob; System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1); int[] tmp = current; @@ -412,7 +418,7 @@ private NgramDPState computeTransition(int[] enWords, List tailNodes, Ac if (ccount == this.ngramOrder) { // Compute the current word probability, and remove it.s float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder); -// System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob)); + // System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob)); transitionLogP += prob; System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1); int[] tmp = current; @@ -422,7 +428,7 @@ private NgramDPState computeTransition(int[] enWords, List tailNodes, Ac } } } -// acc.add(name, transitionLogP); + // acc.add(name, transitionLogP); acc.add(denseFeatureIndex, transitionLogP); if (left_context != null) { @@ -444,8 +450,8 @@ private NgramDPState computeTransition(int[] enWords, List tailNodes, Ac */ private NgramDPState computeFinalTransition(NgramDPState state, Accumulator acc) { -// System.err.println(String.format("LanguageModel::computeFinalTransition()")); - + // System.err.println(String.format("LanguageModel::computeFinalTransition()")); + float res = 0.0f; LinkedList currentNgram = new LinkedList(); int[] leftContext = state.getLeftLMStateWords(); @@ -465,14 +471,14 @@ private NgramDPState computeFinalTransition(NgramDPState state, Accumulator acc) } // Tell the accumulator -// acc.add(name, res); + // acc.add(name, res); acc.add(denseFeatureIndex, res); // State is the same return new NgramDPState(leftContext, rightContext); } - + /** * Compatibility method for {@link #scoreChunkLogP(int[], boolean, boolean)} */ @@ -480,7 +486,7 @@ private float scoreChunkLogP(List words, boolean considerIncompleteNgra boolean skipStart) { return scoreChunkLogP(Ints.toArray(words), considerIncompleteNgrams, skipStart); } - + /** * This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It * computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1 @@ -509,7 +515,7 @@ private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams, return score; } - + /** * Public method to set LM_INDEX back to 0. * Required if multiple instances of the JoshuaDecoder live in the same JVM. 
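The windowing in computeTransition() above reduces to a fixed-size buffer that is scored and then shifted left once it holds a full n-gram; a condensed sketch of just that mechanism, with a stub scorer standing in for NGramLanguageModel:

    public class ChunkScoreDemo {
      // Stub standing in for NGramLanguageModel.ngramLogProbability(int[], int).
      static float ngramLogProbability(int[] ngram, int order) {
        return -1.0f; // pretend every full n-gram costs one unit
      }

      public static void main(String[] args) {
        int order = 3;
        int[] words = {1, 2, 3, 4, 5}; // invented word ids
        int[] current = new int[order];
        int ccount = 0;
        float transitionLogP = 0.0f;
        for (int w : words) {
          current[ccount++] = w;
          if (ccount == order) {
            transitionLogP += ngramLogProbability(current, order);
            // Drop the leftmost word, keeping the remaining order-1 words as context.
            System.arraycopy(current, 1, current, 0, order - 1);
            ccount--;
          }
        }
        System.out.println("transition logP = " + transitionLogP);
      }
    }
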
@@ -517,4 +523,16 @@ private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams, public static void resetLmIndex() { LM_INDEX = 0; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/lm/NGramLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/NGramLanguageModel.java similarity index 80% rename from src/joshua/decoder/ff/lm/NGramLanguageModel.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/NGramLanguageModel.java index 15da650e..882424b0 100644 --- a/src/joshua/decoder/ff/lm/NGramLanguageModel.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/NGramLanguageModel.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; /** * An interface for new language models to implement. An object of this type is passed to * LanguageModelFF, which will handle all the dynamic programming and state maintenance. * - * @author wren ng thornton - * @author Zhifei Li, - * @author Matt Post - * @author Juri Ganitkevitch + * @author wren ng thornton wren@users.sourceforge.net + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevitch juri@cs.jhu.edu */ public interface NGramLanguageModel { @@ -41,12 +41,12 @@ public interface NGramLanguageModel { /** * Language models may have their own private vocabulary mapping strings to integers; for example, * if they make use of a compile format (as KenLM and BerkeleyLM do). This mapping is likely - * different from the global mapping containing in joshua.corpus.Vocabulary, which is used to + * different from the global mapping contained in {@link org.apache.joshua.corpus.Vocabulary}, which is used to * convert the input string and grammars. This function is used to tell the language model what * the global mapping is, so that the language model can convert it into its own private mapping. * - * @param word - * @param id + * @param token string token to be registered + * @param id to associate with this word * @return Whether any collisions were detected. */ boolean registerWord(String token, int id); @@ -63,9 +63,9 @@ public interface NGramLanguageModel { /** * Compute the probability of a single word given its context. * - * @param ngram - * @param order - * @return + * @param ngram the NGram for which we wish to compute the probability + * @param order NGram order/context + * @return float representing the probability */ float ngramLogProbability(int[] ngram, int order); diff --git a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java similarity index 84% rename from src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java index f07b668c..6869def4 100644 --- a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java @@ -16,29 +16,29 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ConcurrentHashMap; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.lm.KenLM; -import joshua.decoder.ff.lm.KenLM.StateProbPair; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.state_maintenance.KenLMState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.lm.KenLM; +import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.state_maintenance.KenLMState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; /** * Wrapper for KenLM LMs with left-state minimization. We inherit from the regular * - * @author Matt Post - * @author Juri Ganitkevitch + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevitch juri@cs.jhu.edu */ public class StateMinimizingLanguageModel extends LanguageModelFF { @@ -49,9 +49,9 @@ public StateMinimizingLanguageModel(FeatureVector weights, String[] args, Joshua super(weights, args, config); this.type = "kenlm"; if (parsedArgs.containsKey("lm_type") && ! parsedArgs.get("lm_type").equals("kenlm")) { - System.err.println("* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend"); - System.err.println("* Remove lm_type from line or set to 'kenlm'"); - System.exit(-1); + String msg = "* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend" + + "* Remove lm_type from line or set to 'kenlm'"; + throw new RuntimeException(msg); } } @@ -66,10 +66,6 @@ public ArrayList reportDenseFeatures(int index) { /** * Initializes the underlying language model. - * - * @param config - * @param type - * @param path */ @Override public void initializeLM() { @@ -162,10 +158,10 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa /** * Destroys the pool created to allocate state for this sentence. Called from the - * {@link joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting + * {@link org.apache.joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting * this map here in KenLMFF statically allows pools to be shared across KenLM instances. 
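The pool handling described above boils down to a shared map from sentence id to a native pool pointer; a minimal sketch of that pattern, with invented stand-ins for the JNI calls that KenLM actually provides:

    import java.util.concurrent.ConcurrentHashMap;

    public class PoolMapDemo {
      // sentence id -> native pool handle, shareable across KenLM instances
      private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<>();

      static long getOrCreatePool(int sentId) {
        // computeIfAbsent keeps exactly one pool per sentence under concurrent decoding
        return poolMap.computeIfAbsent(sentId, id -> allocateNativePool());
      }

      static void destroyPool(int sentId) {
        Long pointer = poolMap.remove(sentId);
        if (pointer != null) {
          freeNativePool(pointer);
        }
      }

      // Stand-ins for the native allocation/release calls.
      static long allocateNativePool() { return System.nanoTime(); }
      static void freeNativePool(long pointer) { /* release native memory here */ }

      public static void main(String[] args) {
        long p = getOrCreatePool(7);
        destroyPool(7);
        System.out.println("allocated and freed pool " + p);
      }
    }
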
* - * @param sentId + * @param sentId a key in the poolmap table to destroy */ public void destroyPool(int sentId) { if (poolMap.containsKey(sentId)) diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/LICENSE b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LICENSE similarity index 100% rename from src/joshua/decoder/ff/lm/berkeley_lm/LICENSE rename to src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LICENSE diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java similarity index 92% rename from src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java index 27165762..d642fe99 100644 --- a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm.berkeley_lm; +package org.apache.joshua.decoder.ff.lm.berkeley_lm; import java.io.File; import java.util.Arrays; @@ -24,11 +24,12 @@ import java.util.logging.Level; import java.util.logging.Logger; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel; + import com.google.common.annotations.VisibleForTesting; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.lm.DefaultNGramLanguageModel; -import joshua.decoder.Decoder; import edu.berkeley.nlp.lm.ArrayEncodedNgramLanguageModel; import edu.berkeley.nlp.lm.ConfigOptions; import edu.berkeley.nlp.lm.StringWordIndexer; @@ -36,6 +37,7 @@ import edu.berkeley.nlp.lm.cache.ArrayEncodedCachingLmWrapper; import edu.berkeley.nlp.lm.io.LmReaders; import edu.berkeley.nlp.lm.util.StrUtils; +import org.slf4j.LoggerFactory; /** * This class wraps Berkeley LM. 
@@ -44,6 +46,8 @@ */ public class LMGrammarBerkeley extends DefaultNGramLanguageModel { + public static final org.slf4j.Logger LOG = LoggerFactory.getLogger(LMGrammarBerkeley.class); + private ArrayEncodedNgramLanguageModel lm; private static final Logger logger = Logger.getLogger(LMGrammarBerkeley.class.getName()); @@ -71,8 +75,7 @@ public LMGrammarBerkeley(int order, String lm_file) { vocabIdToMyIdMapping = new int[10]; if (!new File(lm_file).exists()) { - System.err.println("Can't read lm_file '" + lm_file + "'"); - System.exit(1); + throw new RuntimeException("Can't read lm_file '" + lm_file + "'"); } if (logRequests) { @@ -83,10 +86,10 @@ public LMGrammarBerkeley(int order, String lm_file) { try { // try binary format (even gzipped) lm = (ArrayEncodedNgramLanguageModel) LmReaders.readLmBinary(lm_file); - Decoder.LOG(1, "Loading Berkeley LM from binary " + lm_file); + LOG.info("Loading Berkeley LM from binary {}", lm_file); } catch (RuntimeException e) { ConfigOptions opts = new ConfigOptions(); - Decoder.LOG(1, "Loading Berkeley LM from ARPA file " + lm_file); + LOG.info("Loading Berkeley LM from ARPA file {}", lm_file); final StringWordIndexer wordIndexer = new StringWordIndexer(); ArrayEncodedNgramLanguageModel berkeleyLm = LmReaders.readArrayEncodedLmFromArpa(lm_file, false, wordIndexer, opts, order); diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/README b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/README similarity index 100% rename from src/joshua/decoder/ff/lm/berkeley_lm/README rename to src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/README diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java similarity index 95% rename from src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java index a45dd7f0..e22e6d1d 100644 --- a/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm.berkeley_lm; +package org.apache.joshua.decoder.ff.lm.berkeley_lm; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; import edu.berkeley.nlp.lm.WordIndexer; class SymbolTableWrapper implements WordIndexer { diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java similarity index 99% rename from src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java index 7f0b6a4e..a66fa44a 100644 --- a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff.lm.bloomfilter_lm; +package org.apache.joshua.decoder.ff.lm.bloomfilter_lm; import java.io.Externalizable; import java.io.IOException; diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java similarity index 95% rename from src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java rename to src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java index c91fe388..7d0e5991 100644 --- a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.lm.bloomfilter_lm; +package org.apache.joshua.decoder.ff.lm.bloomfilter_lm; import java.io.Externalizable; import java.io.FileInputStream; @@ -29,14 +29,15 @@ import java.io.ObjectOutput; import java.io.ObjectOutputStream; import java.util.HashMap; -import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.lm.DefaultNGramLanguageModel; -import joshua.util.Regex; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel; +import org.apache.joshua.util.Regex; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * An n-gram language model with linearly-interpolated Witten-Bell smoothing, using a Bloom filter @@ -62,7 +63,7 @@ public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implemen /** * The logger for this class. */ - public static final Logger logger = Logger.getLogger(BloomFilterLanguageModel.class.getName()); + private static final Logger LOG = LoggerFactory.getLogger(BloomFilterLanguageModel.class); /** * The Bloom filter data structure itself. 
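As background for the two Bloom filter files above: a Bloom filter is a bit array probed by k hash functions; membership tests can return false positives but never false negatives, which the language model tolerates in exchange for a very small memory footprint. A generic toy sketch of the idea, not Joshua's BloomFilter implementation (its hashing scheme differs):

    import java.util.BitSet;

    public class TinyBloomFilter {
      private final BitSet bits;
      private final int size;
      private final int numHashes;

      public TinyBloomFilter(int size, int numHashes) {
        this.bits = new BitSet(size);
        this.size = size;
        this.numHashes = numHashes;
      }

      private int index(String item, int seed) {
        // Derive k indexes from the item's hash; a real filter uses stronger hashes.
        int h = item.hashCode() * 31 + seed * 0x9E3779B9;
        return Math.floorMod(h, size);
      }

      public void add(String item) {
        for (int i = 0; i < numHashes; i++) bits.set(index(item, i));
      }

      public boolean mightContain(String item) {
        for (int i = 0; i < numHashes; i++)
          if (!bits.get(index(item, i))) return false; // definitely absent
        return true; // probably present
      }
    }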
@@ -115,6 +116,7 @@ public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implemen * * @param order the order of the language model * @param filename path to the file where the language model is stored + * @throws IOException if the bloom filter language model cannot be rebuilt from the input file */ public BloomFilterLanguageModel(int order, String filename) throws IOException { super(order); @@ -331,8 +333,10 @@ private static double logAdd(double x, double y) { */ public static void main(String[] argv) { if (argv.length < 5) { - System.err - .println("usage: BloomFilterLanguageModel "); + String msg = "usage: BloomFilterLanguageModel " + + " "; + System.err.println(msg); + LOG.error(msg); return; } int order = Integer.parseInt(argv[1]); @@ -346,11 +350,9 @@ public static void main(String[] argv) { new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(argv[4]))); lm.writeExternal(out); - out.close(); - } catch (FileNotFoundException e) { - System.err.println(e.getMessage()); + out.close(); //TODO: try-with-resources } catch (IOException e) { - System.err.println(e.getMessage()); + LOG.error(e.getMessage(), e); } } @@ -378,16 +380,13 @@ private void populateBloomFilter(int bloomFilterSize, String filename) { estimateStream = file_in_copy; } int numObjects = estimateNumberOfObjects(estimateStream); - System.err.println("Estimated number of objects: " + numObjects); + LOG.debug("Estimated number of objects: {}", numObjects); bf = new BloomFilter(bloomFilterSize, numObjects); countFuncs = bf.initializeHashFunctions(); populateFromInputStream(in, typesAfter); in.close(); - } catch (FileNotFoundException e) { - System.err.println(e.getMessage()); - return; } catch (IOException e) { - System.err.println(e.getMessage()); + LOG.error(e.getMessage(), e); return; } typesFuncs = bf.initializeHashFunctions(); @@ -421,7 +420,7 @@ private int estimateNumberOfObjects(InputStream source) { long cnt = Long.parseLong(toks[toks.length - 1]); if (cnt > maxCount) maxCount = cnt; } catch (NumberFormatException e) { - System.err.println("NumberFormatException! Line: " + line); + LOG.error(e.getMessage(), e); break; } numLines++; diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java new file mode 100644 index 00000000..19fa6952 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
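The //TODO: try-with-resources note in the hunk above points at the standard fix: let the runtime close the stream on every path. A sketch of how that could look (class and method shape hypothetical):

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.ObjectOutputStream;
    import java.util.zip.GZIPOutputStream;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class LmWriter {
      private static final Logger LOG = LoggerFactory.getLogger(LmWriter.class);

      static void writeLm(BloomFilterLanguageModel lm, String outputPath) {
        // The stream is closed automatically on success and on exception alike,
        // which the explicit out.close() above does not guarantee.
        try (ObjectOutputStream out = new ObjectOutputStream(
            new GZIPOutputStream(new FileOutputStream(outputPath)))) {
          lm.writeExternal(out);
        } catch (IOException e) {
          LOG.error(e.getMessage(), e);
        }
      }
    }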
+ */ + +/** + * Provides an implementation of a bloom filter language model, and + * an associated implementation of the language model feature function typically used in + * hierarchical phrase-based decoding for statistical machine translation. + */ +package org.apache.joshua.decoder.ff.lm.bloomfilter_lm; diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java new file mode 100644 index 00000000..ccfff46d --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm.buildin_lm; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; +import java.util.Scanner; + +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.lm.AbstractLM; +import org.apache.joshua.decoder.ff.lm.ArpaFile; +import org.apache.joshua.decoder.ff.lm.ArpaNgram; +import org.apache.joshua.util.Bits; +import org.apache.joshua.util.Regex; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Relatively memory-compact language model + * stored as a reversed-word-order trie. + *
<p>
+ * The trie itself represents language model context. + *
<p>
+ * Conceptually, each node in the trie stores a map + * from conditioning word to log probability. + *
<p>
+ * Additionally, each node in the trie stores + * the backoff weight for that context. + * + * @author Lane Schwartz + * @see SRILM ngram-discount documentation + */ +public class TrieLM extends AbstractLM { //DefaultNGramLanguageModel { + + private static final Logger LOG = LoggerFactory.getLogger(TrieLM.class); + + /** + * Node ID for the root node. + */ + private static final int ROOT_NODE_ID = 0; + + + /** + * Maps from (node id, word id for child) --> node id of child. + */ + private final Map children; + + /** + * Maps from (node id, word id for lookup word) --> + * log prob of lookup word given context + * + * (the context is defined by where you are in the tree). + */ + private final Map logProbs; + + /** + * Maps from (node id) --> + * backoff weight for that context + * + * (the context is defined by where you are in the tree). + */ + private final Map backoffs; + + public TrieLM(Vocabulary vocab, String file) throws FileNotFoundException { + this(new ArpaFile(file,vocab)); + } + + /** + * Constructs a language model object from the specified ARPA file. + * + * @param arpaFile input ARPA file + * @throws FileNotFoundException if the input file cannot be located + */ + public TrieLM(ArpaFile arpaFile) throws FileNotFoundException { + super(arpaFile.getVocab().size(), arpaFile.getOrder()); + + int ngramCounts = arpaFile.size(); + LOG.debug("ARPA file contains {} n-grams", ngramCounts); + + this.children = new HashMap(ngramCounts); + this.logProbs = new HashMap(ngramCounts); + this.backoffs = new HashMap(ngramCounts); + + int nodeCounter = 0; + + int lineNumber = 0; + for (ArpaNgram ngram : arpaFile) { + lineNumber += 1; + if (lineNumber % 100000 == 0){ + LOG.info("Line: {}", lineNumber); + } + + LOG.debug("{}-gram: ({} | {})", ngram.order(), ngram.getWord(), + Arrays.toString(ngram.getContext())); + int word = ngram.getWord(); + + int[] context = ngram.getContext(); + + { + // Find where the log prob should be stored + int contextNodeID = ROOT_NODE_ID; + { + for (int i=context.length-1; i>=0; i--) { + long key = Bits.encodeAsLong(contextNodeID, context[i]); + int childID; + if (children.containsKey(key)) { + childID = children.get(key); + } else { + childID = ++nodeCounter; + LOG.debug("children.put({}:{}, {})", contextNodeID, context[i], childID); + children.put(key, childID); + } + contextNodeID = childID; + } + } + + // Store the log prob for this n-gram at this node in the trie + { + long key = Bits.encodeAsLong(contextNodeID, word); + float logProb = ngram.getValue(); + LOG.debug("logProbs.put({}:{}, {}", contextNodeID, word, logProb); + this.logProbs.put(key, logProb); + } + } + + { + // Find where the backoff should be stored + int backoffNodeID = ROOT_NODE_ID; + { + long backoffNodeKey = Bits.encodeAsLong(backoffNodeID, word); + int wordChildID; + if (children.containsKey(backoffNodeKey)) { + wordChildID = children.get(backoffNodeKey); + } else { + wordChildID = ++nodeCounter; + LOG.debug("children.put({}: {}, {})", backoffNodeID, word, wordChildID); + children.put(backoffNodeKey, wordChildID); + } + backoffNodeID = wordChildID; + + for (int i=context.length-1; i>=0; i--) { + long key = Bits.encodeAsLong(backoffNodeID, context[i]); + int childID; + if (children.containsKey(key)) { + childID = children.get(key); + } else { + childID = ++nodeCounter; + LOG.debug("children.put({}:{}, {})", backoffNodeID, context[i], childID); + children.put(key, childID); + } + backoffNodeID = childID; + } + } + + // Store the backoff for this n-gram at this node in the trie + { + float 
backoff = ngram.getBackoff(); + LOG.debug("backoffs.put({}:{}, {})", backoffNodeID, word, backoff); + this.backoffs.put(backoffNodeID, backoff); + } + } + + } + } + + + @Override + protected double logProbabilityOfBackoffState_helper( + int[] ngram, int order, int qtyAdditionalBackoffWeight + ) { + throw new UnsupportedOperationException("probabilityOfBackoffState_helper undefined for TrieLM"); + } + + @Override + protected float ngramLogProbability_helper(int[] ngram, int order) { + +// float logProb = (float) -JoshuaConfiguration.lm_ceiling_cost;//Float.NEGATIVE_INFINITY; // log(0.0f) + float backoff = 0.0f; // log(1.0f) + + int i = ngram.length - 1; + int word = ngram[i]; + i -= 1; + + int nodeID = ROOT_NODE_ID; + + while (true) { + + { + long key = Bits.encodeAsLong(nodeID, word); + if (logProbs.containsKey(key)) { +// logProb = logProbs.get(key); + backoff = 0.0f; // log(0.0f) + } + } + + if (i < 0) { + break; + } + + { + long key = Bits.encodeAsLong(nodeID, ngram[i]); + + if (children.containsKey(key)) { + nodeID = children.get(key); + + backoff += backoffs.get(nodeID); + + i -= 1; + + } else { + break; + } + } + + } + +// double result = logProb + backoff; +// if (result < -JoshuaConfiguration.lm_ceiling_cost) { +// result = -JoshuaConfiguration.lm_ceiling_cost; +// } +// +// return result; + return (Float) null; + } + + public Map getChildren() { + return this.children; + } + + public static void main(String[] args) throws IOException { + + LOG.info("Constructing ARPA file"); + ArpaFile arpaFile = new ArpaFile(args[0]); + + LOG.info("Getting symbol table"); + Vocabulary vocab = arpaFile.getVocab(); + + LOG.info("Constructing TrieLM"); + TrieLM lm = new TrieLM(arpaFile); + + int n = Integer.valueOf(args[2]); + LOG.info("N-gram order will be {}", n); + + Scanner scanner = new Scanner(new File(args[1])); + + LinkedList wordList = new LinkedList(); + LinkedList window = new LinkedList(); + + LOG.info("Starting to scan {}", args[1]); + while (scanner.hasNext()) { + + LOG.info("Getting next line..."); + String line = scanner.nextLine(); + LOG.info("Line: {}", line); + + String[] words = Regex.spaces.split(line); + wordList.clear(); + + wordList.add(""); + for (String word : words) { + wordList.add(word); + } + wordList.add(""); + + ArrayList sentence = new ArrayList(); + // int[] ids = new int[wordList.size()]; + for (int i=0, size=wordList.size(); i=n) break; + window.add(word); + i++; + } + wordList.remove(); + } + + { + int i=0; + int[] wordIDs = new int[window.size()]; + for (String word : window) { + wordIDs[i] = vocab.id(word); + i++; + } + + LOG.info("logProb {} = {}", window, lm.ngramLogProbability(wordIDs, n)); + } + } + + double logProb = lm.sentenceLogProbability(sentence, n, 2);//.ngramLogProbability(ids, n); + double prob = Math.exp(logProb); + + LOG.info("Total logProb = {}", logProb); + LOG.info("Total prob = {}", prob); + } + + } + + +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java new file mode 100644 index 00000000..6c847035 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
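To make TrieLM's reversed-word-order layout (above) concrete: a query for log P(w | a b) walks child links from the root through the reversed context, b then a, and reads the probability stored for w at the node it reaches. A toy walk, assuming keys pack (node id, word id) into a long in the style of Bits.encodeAsLong:

    import java.util.Map;

    public class TrieWalk {
      static final int ROOT_NODE_ID = 0;

      // Assumed packing: high 32 bits hold the node id, low 32 bits the word id.
      static long key(int nodeId, int wordId) {
        return ((long) nodeId << 32) | (wordId & 0xFFFFFFFFL);
      }

      /** Returns log P(word | context), or null if the full context is unseen. */
      static Float lookup(Map<Long, Integer> children, Map<Long, Float> logProbs,
          int[] context, int word) {
        int node = ROOT_NODE_ID;
        // Walk the context in reverse: for P(w | a b), follow b, then a.
        for (int i = context.length - 1; i >= 0; i--) {
          Integer child = children.get(key(node, context[i]));
          if (child == null) {
            return null; // a full LM would fall back to the stored backoff weights
          }
          node = child;
        }
        return logProbs.get(key(node, word));
      }
    }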
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm.buildin_lm; \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java new file mode 100644 index 00000000..22da71e5 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + *
<p>
Provides abstraction and support for the language model + * feature function typically used in hierarchical phrase-based + * decoding for statistical machine translation.
</p>
+ *
<p>
The classes contained within this directory are + * responsible for two tasks: implementing the feature function, + * and representing the language model itself. The class + * `LanguageModelFF` implements the feature function by extending + * the class `DefaultStatefulFF`. One of these is instantiated + * for each language model present in the decoder.
</p>
+ *
<p>
The language models themselves are implemented as a + * combination of an interface (`NGramLanguageModel`), a default + * implementation (`DefaultNGramLanguageModel`), and an abstract + * implementation of the default (`AbstractLM`).
</p>
+ * + * <pre>
+ *  DefaultStatefulFF
+ *  |- LanguageModelFF
+ *
+ *  DefaultNgramLanguageModel implements interface NGramLanguageModel
+ *  |- AbstractLM
+ * </pre>
+ */ +package org.apache.joshua.decoder.ff.lm; diff --git a/src/main/java/org/apache/joshua/decoder/ff/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/package-info.java new file mode 100644 index 00000000..b0af73e6 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/package-info.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + *
<p>
Provides an implementation of the linear feature functions + * typically used in hierarchical phrase-based decoding for + * statistical machine translation.
</p>
+ *
<p>
The following is a note from Juri describing some of the + * functionality of the feature functions interfaces and default + * abstract classes.
</p>
+ * <pre>
+ * The equality that I intended for is ff.transitionLogP() =
+ * ff.estimateLogP() + ff.reEstimateTransitionLogP(). The re-estimate
+ * fixes the estimate to be the true transition cost that takes into
+ * account the state. Before decoding the cost of applying a rule is
+ * estimated via estimateLogP() and yields the phrasal feature costs plus
+ * an LM estimate of the cost of the lexical portions of the rule.
+ * transitionLogP() takes rule and state and computes everything from
+ * scratch, whereas reEstimateTransitionLogP() adds in the cost of new
+ * n-grams that result from combining the rule with the LM states and
+ * subtracts out the cost of superfluous less-than-n-grams that were
+ * overridden by the updated cost calculation.
+ * 
+ * Hope this helps.
+ * </pre>
+ */ +package org.apache.joshua.decoder.ff; diff --git a/src/joshua/decoder/ff/phrase/Distortion.java b/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java similarity index 67% rename from src/joshua/decoder/ff/phrase/Distortion.java rename to src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java index 15aced8f..abe115ca 100644 --- a/src/joshua/decoder/ff/phrase/Distortion.java +++ b/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.phrase; +package org.apache.joshua.decoder.ff.phrase; import java.util.ArrayList; import java.util.List; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.StatelessFF; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.phrase.Hypothesis; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.StatelessFF; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.phrase.Hypothesis; +import org.apache.joshua.decoder.segment_file.Sentence; public class Distortion extends StatelessFF { @@ -37,9 +37,9 @@ public Distortion(FeatureVector weights, String[] args, JoshuaConfiguration conf super(weights, "Distortion", args, config); if (! config.search_algorithm.equals("stack")) { - System.err.println("* FATAL: Distortion feature only application for phrase-based decoding"); - System.err.println(" Use -search phrase or remove this feature"); - System.exit(1); + String msg = "* FATAL: Distortion feature only application for phrase-based decoding. " + + "Use -search phrase or remove this feature"; + throw new RuntimeException(msg); } } @@ -68,4 +68,16 @@ public DPState compute(Rule rule, List tailNodes, int i, int j, SourcePa return null; } + + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } } diff --git a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java similarity index 88% rename from src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java rename to src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java index 3497001e..476ecac7 100644 --- a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java +++ b/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
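Juri's note in the package-info above reduces to one identity: transitionLogP() = estimateLogP() + reEstimateTransitionLogP(). With made-up numbers purely for illustration:

    public class TransitionIdentityDemo {
      public static void main(String[] args) {
        double estimateLogP = -4.2; // phrasal features + LM estimate of the lexical items
        double reEstimate = -0.7;   // new boundary n-grams minus superseded short n-grams
        double transitionLogP = estimateLogP + reEstimate;
        System.out.println("transitionLogP = " + transitionLogP); // -4.9
      }
    }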
*/ -package joshua.decoder.ff.similarity; +package org.apache.joshua.decoder.ff.similarity; import java.io.BufferedReader; import java.io.IOException; @@ -30,21 +30,25 @@ import com.google.common.base.Throwables; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.StatefulFF; -import joshua.decoder.ff.SourceDependentFF; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.state_maintenance.NgramDPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.segment_file.Sentence; -import joshua.util.Cache; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.StatefulFF; +import org.apache.joshua.decoder.ff.SourceDependentFF; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.util.Cache; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependentFF { + private static final Logger LOG = LoggerFactory.getLogger(EdgePhraseSimilarityFF.class); + private static Cache cache = new Cache(100000000); private String host; @@ -68,9 +72,8 @@ public EdgePhraseSimilarityFF(FeatureVector weights, String[] args, JoshuaConfig initializeConnection(); } - private void initializeConnection() throws NumberFormatException, UnknownHostException, - IOException { - System.err.println("Opening connection."); + private void initializeConnection() throws NumberFormatException, IOException { + LOG.info("Opening connection."); socket = new Socket(host, port); serverAsk = new PrintWriter(socket.getOutputStream(), true); serverReply = new BufferedReader(new InputStreamReader(socket.getInputStream())); @@ -274,4 +277,16 @@ private float getSimilarity(List batch) { return (count == 0 ? 0 : similarity / count); } + @Override + public double estimateLogP(Rule rule, int sentID) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public double getWeight() { + // TODO Auto-generated method stub + return 0; + } + } diff --git a/src/joshua/decoder/ff/state_maintenance/DPState.java b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java similarity index 87% rename from src/joshua/decoder/ff/state_maintenance/DPState.java rename to src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java index 1a02a90e..e117fde9 100644 --- a/src/joshua/decoder/ff/state_maintenance/DPState.java +++ b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.state_maintenance; +package org.apache.joshua.decoder.ff.state_maintenance; /** * Abstract class enforcing explicit implementation of the standard methods. 
* - * @author Zhifei Li, - * @author Juri Ganitkevitch, + * @author Zhifei Li, zhifei.work@gmail.com + * @author Juri Ganitkevitch, juri@cs.jhu.edu */ public abstract class DPState { diff --git a/src/joshua/decoder/ff/state_maintenance/KenLMState.java b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java similarity index 91% rename from src/joshua/decoder/ff/state_maintenance/KenLMState.java rename to src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java index 906f8d88..4fdc6316 100644 --- a/src/joshua/decoder/ff/state_maintenance/KenLMState.java +++ b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.state_maintenance; +package org.apache.joshua.decoder.ff.state_maintenance; /** * Maintains a state pointer used by KenLM to implement left-state minimization. * - * @author Matt Post - * @author Juri Ganitkevitch + * @author Matt Post post@cs.jhu.edu + * @author Juri Ganitkevitch juri@cs.jhu.edu */ public class KenLMState extends DPState { diff --git a/src/joshua/decoder/ff/state_maintenance/NgramDPState.java b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java similarity index 93% rename from src/joshua/decoder/ff/state_maintenance/NgramDPState.java rename to src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java index b72a5ba4..b269bd91 100644 --- a/src/joshua/decoder/ff/state_maintenance/NgramDPState.java +++ b/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.state_maintenance; +package org.apache.joshua.decoder.ff.state_maintenance; import java.util.Arrays; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; /** - * @author Zhifei Li, - * @author Juri Ganitkevitch, + * @author Zhifei Li, zhifei.work@gmail.com + * @author Juri Ganitkevitch, juri@cs.jhu.edu */ public class NgramDPState extends DPState { diff --git a/src/joshua/decoder/ff/tm/AbstractGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java similarity index 80% rename from src/joshua/decoder/ff/tm/AbstractGrammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java index 8cfb2ad6..5a5d02bc 100644 --- a/src/joshua/decoder/ff/tm/AbstractGrammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java @@ -16,22 +16,23 @@ * specific language governing permissions and limitations * under the License. 
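Since DPState above only pins down the standard Object-contract methods, a minimal subclass is short. A hypothetical example in the spirit of KenLMState, which wraps a single long state pointer (this sketch assumes the abstract methods are hashCode, equals, and toString):

    public class PointerState extends DPState {
      private final long state;

      public PointerState(long state) {
        this.state = state;
      }

      public long getState() {
        return state;
      }

      @Override
      public int hashCode() {
        return Long.hashCode(state);
      }

      @Override
      public boolean equals(Object other) {
        return other instanceof PointerState && ((PointerState) other).state == this.state;
      }

      @Override
      public String toString() {
        return String.format("[PointerState %d]", state);
      }
    }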
*/ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; -import java.util.ArrayList; -import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.segment_file.Token; -import joshua.lattice.Arc; -import joshua.lattice.Lattice; -import joshua.lattice.Node; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.phrase.PhraseTable; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.lattice.Arc; +import org.apache.joshua.lattice.Lattice; +import org.apache.joshua.lattice.Node; + +import cern.colt.Arrays; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Partial implementation of the Grammar interface that provides logic for sorting a @@ -43,13 +44,12 @@ * * @author Zhifei Li * @author Lane Schwartz - * @author Matt Post models) { Trie root = getTrieRoot(); @@ -126,17 +128,17 @@ public boolean isSorted() { /** * Sets the flag indicating whether this grammar is sorted. *
<p>
- * This method is called by {@link #sortGrammar(ArrayList)} to indicate that the grammar has been - * sorted. + * This method is called by {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar#sortGrammar(List)} + * to indicate that the grammar has been sorted.
</p>
* - * Its scope is protected so that child classes that override sortGrammar will also - * be able to call this method to indicate that the grammar has been sorted. + *
<p>
Its scope is protected so that child classes that override sortGrammar will also + * be able to call this method to indicate that the grammar has been sorted.
</p>
* - * @param sorted + * @param sorted set to true if the grammar is sorted */ protected void setSorted(boolean sorted) { this.sorted = sorted; - logger.fine("This grammar is now sorted: " + this); + LOG.debug("This grammar is now sorted: {}", this); } /** @@ -153,13 +155,12 @@ private void sort(Trie node, List models) { if (node != null) { if (node.hasRules()) { RuleCollection rules = node.getRuleCollection(); - if (logger.isLoggable(Level.FINE)) - logger.fine("Sorting node " + Arrays.toString(rules.getSourceSide())); + LOG.debug("Sorting node {}", Arrays.toString(rules.getSourceSide())); /* This causes the rules at this trie node to be sorted */ rules.getSortedRules(models); - if (logger.isLoggable(Level.FINEST)) { + if (LOG.isDebugEnabled()) { StringBuilder s = new StringBuilder(); for (Rule r : rules.getSortedRules(models)) { s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| " @@ -167,7 +168,7 @@ private void sort(Trie node, List models) { + r.getEstimatedCost() + " " + r.getClass().getName() + "@" + Integer.toHexString(System.identityHashCode(r))); } - logger.finest(s.toString()); + LOG.debug("{}", s); } } @@ -175,8 +176,8 @@ private void sort(Trie node, List models) { for (Trie child : node.getExtensions()) { sort(child, models); } - } else if (logger.isLoggable(Level.FINE)) { - logger.fine("Node has 0 children to extend: " + node); + } else { + LOG.debug("Node has 0 children to extend: {}", node); } } } @@ -189,8 +190,10 @@ public void writeGrammarOnDisk(String file) { * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}). * + * @param grammar Grammar in the Trie * @param inputLattice the lattice representing the input sentence * @param featureFunctions a list of feature functions used for scoring + * @param onlyTrue determine if word is actual OOV. */ public static void addOOVRules(Grammar grammar, Lattice inputLattice, List featureFunctions, boolean onlyTrue) { diff --git a/src/joshua/decoder/ff/tm/BasicRuleCollection.java b/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java similarity index 96% rename from src/joshua/decoder/ff/tm/BasicRuleCollection.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java index 6dda7f71..4cffb2fd 100644 --- a/src/joshua/decoder/ff/tm/BasicRuleCollection.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureFunction; /** * Basic collection of translation rules. diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java b/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java new file mode 100644 index 00000000..b2299baf --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.tm; + +import java.util.Arrays; +import java.util.Map; + +import org.apache.joshua.corpus.SymbolTable; + + +/** + * Normally, the feature score in the rule should be *cost* (i.e., + * -LogP), so that the feature weight should be positive + * + * @author Zhifei Li, zhifei.work@gmail.com + * @version $LastChangedDate: 2010-01-20 19:46:54 -0600 (Wed, 20 Jan 2010) $ + */ +public class BilingualRule extends MonolingualRule { + + private int[] english; + + //=============================================================== + // Constructors + //=============================================================== + + /** + * Constructs a new rule using the provided parameters. The + * owner and rule id for this rule are undefined. + * + * @param lhs Left-hand side of the rule. + * @param sourceRhs Source language right-hand side of the rule. + * @param targetRhs Target language right-hand side of the rule. + * @param featureScores Feature value scores for the rule. + * @param arity Number of nonterminals in the source language + * right-hand side. + * @param owner todo + * @param latticeCost todo + * @param ruleID todo + */ + public BilingualRule(int lhs, int[] sourceRhs, int[] targetRhs, float[] featureScores, int arity, int owner, float latticeCost, int ruleID) { + super(lhs, sourceRhs, featureScores, arity, owner, latticeCost, ruleID); + this.english = targetRhs; + } + + //called by class who does not care about lattice_cost, rule_id, and owner + public BilingualRule(int lhs, int[] sourceRhs, int[] targetRhs, float[] featureScores, int arity) { + super(lhs, sourceRhs, featureScores, arity); + this.english = targetRhs; + } + + + //=============================================================== + // Attributes + //=============================================================== + + public final void setEnglish(int[] eng) { + this.english = eng; + } + + public final int[] getEnglish() { + return this.english; + } + + + //=============================================================== + // Serialization Methods + //=============================================================== + // TODO: remove these methods + + // Caching this method significantly improves performance + // We mark it transient because it is, though cf java.io.Serializable + private transient String cachedToString = null; + + public String toString(Map ntVocab, SymbolTable sourceVocab, SymbolTable targetVocab) { + if (null == this.cachedToString) { + StringBuffer sb = new StringBuffer("["); + sb.append(ntVocab.get(this.getLHS())); + sb.append("] ||| "); + sb.append(sourceVocab.getWords(this.getFrench(),true)); + sb.append(" ||| "); + sb.append(targetVocab.getWords(this.english,false)); + //sb.append(java.util.Arrays.toString(this.english)); + sb.append(" |||"); + for (int i = 0; i < this.getFeatureScores().length; i++) { + // sb.append(String.format(" %.12f", this.getFeatureScores()[i])); + sb.append(' '); + 
sb.append(Float.toString(this.getFeatureScores()[i])); + } + this.cachedToString = sb.toString(); + } + return this.cachedToString; + } + + + //print the rule in terms of Integers + public String toString() { + if (null == this.cachedToString) { + StringBuffer sb = new StringBuffer(); + sb.append(this.getClass().getName() + "@" + Integer.toHexString(System.identityHashCode(this))); + sb.append("~~~"); + sb.append(this.getLHS()); + sb.append(" ||| "); + sb.append(Arrays.toString(this.getFrench())); + sb.append(" ||| "); + sb.append(Arrays.toString(this.english)); + sb.append(" |||"); + for (int i = 0; i < this.getFeatureScores().length; i++) { + sb.append(String.format(" %.4f", this.getFeatureScores()[i])); + } + this.cachedToString = sb.toString(); + } + return this.cachedToString; + } + + + public String toString(SymbolTable symbolTable) { + if (null == this.cachedToString) { + StringBuffer sb = new StringBuffer(); + sb.append(symbolTable.getWord(this.getLHS())); + sb.append(" ||| "); + sb.append(symbolTable.getWords(this.getFrench())); + sb.append(" ||| "); + sb.append(symbolTable.getWords(this.english)); + sb.append(" |||"); + for (int i = 0; i < this.getFeatureScores().length; i++) { + sb.append(String.format(" %.4f", this.getFeatureScores()[i])); + } + this.cachedToString = sb.toString(); + } + return this.cachedToString; + } + + public String toStringWithoutFeatScores(SymbolTable symbolTable) { + StringBuffer sb = new StringBuffer(); + if(symbolTable==null) + sb.append(this.getLHS()); + else + sb.append(symbolTable.getWord(this.getLHS())); + + return sb.append(" ||| ") + .append(convertToString(this.getFrench(), symbolTable)) + .append(" ||| ") + .append(convertToString(this.getEnglish(), symbolTable)) + .toString(); + } + + + + + +} \ No newline at end of file diff --git a/src/joshua/decoder/ff/tm/CreateGlueGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java similarity index 84% rename from src/joshua/decoder/ff/tm/CreateGlueGrammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java index 51e9fc30..ce1e7d14 100644 --- a/src/joshua/decoder/ff/tm/CreateGlueGrammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java @@ -16,33 +16,34 @@ * specific language governing permissions and limitations * under the License. 
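The CreateGlueGrammar hunk just below parses its command line with args4j. For reference, the core pattern it relies on, reduced to a self-contained sketch:

    import org.kohsuke.args4j.CmdLineException;
    import org.kohsuke.args4j.CmdLineParser;
    import org.kohsuke.args4j.Option;

    public class ArgsDemo {
      @Option(name = "--grammar", aliases = {"-g"}, required = true,
          usage = "grammar used to collect nonterminal symbols")
      private String grammarPath;

      public static void main(String[] args) {
        ArgsDemo demo = new ArgsDemo();
        CmdLineParser parser = new CmdLineParser(demo); // reflects over @Option fields
        try {
          parser.parseArgument(args);
          System.out.println("grammar: " + demo.grammarPath);
        } catch (CmdLineException e) {
          parser.printUsage(System.err); // same failure handling as CreateGlueGrammar
          System.exit(1);
        }
      }
    }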
*/ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; -import static joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME; -import static joshua.util.FormatUtils.cleanNonTerminal; -import static joshua.util.FormatUtils.isNonterminal; +import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME; +import static org.apache.joshua.util.FormatUtils.cleanNonTerminal; +import static org.apache.joshua.util.FormatUtils.isNonterminal; import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.Set; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.util.io.LineReader; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class CreateGlueGrammar { - - + + + private static final Logger LOG = LoggerFactory.getLogger(CreateGlueGrammar.class); + private final Set nonTerminalSymbols = new HashSet<>(); - private static final Logger log = Logger.getLogger(CreateGlueGrammar.class.getName()); - + @Option(name = "--grammar", aliases = {"-g"}, required = true, usage = "provide grammar to determine list of NonTerminal symbols.") private String grammarPath; @@ -84,7 +85,7 @@ private void run() throws IOException { int lhsStart = line.indexOf("[") + 1; int lhsEnd = line.indexOf("]"); if (lhsStart < 1 || lhsEnd < 0) { - log.info(String.format("malformed rule: %s\n", line)); + LOG.info("malformed rule: {}\n", line); continue; } final String lhs = line.substring(lhsStart, lhsEnd); @@ -92,10 +93,8 @@ private void run() throws IOException { } } - log.info( - String.format("%d nonTerminal symbols read: %s", - nonTerminalSymbols.size(), - nonTerminalSymbols.toString())); + LOG.info("{} nonTerminal symbols read: {}", nonTerminalSymbols.size(), + nonTerminalSymbols.toString()); // write glue rules to stdout @@ -119,7 +118,7 @@ public static void main(String[] args) throws IOException { parser.parseArgument(args); glueCreator.run(); } catch (CmdLineException e) { - log.info(e.toString()); + LOG.error(e.getMessage(), e); parser.printUsage(System.err); System.exit(1); } diff --git a/src/joshua/decoder/ff/tm/Grammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java similarity index 83% rename from src/joshua/decoder/ff/tm/Grammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java index a8344423..9748ba07 100644 --- a/src/joshua/decoder/ff/tm/Grammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.List; -import joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureFunction; /** * Grammar is a class for wrapping a trie of TrieGrammar in order to store holistic metadata. * - * @author wren ng thornton - * @author Zhifei Li, + * @author wren ng thornton wren@users.sourceforge.net + * @author Zhifei Li, zhifei.work@gmail.com */ public interface Grammar { @@ -45,7 +45,7 @@ public interface Grammar { *
<p>
* Cube-pruning requires that the grammar be sorted based on the latest feature functions. * - * @param weights The model weights. + * @param models list of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s */ void sortGrammar(List models); @@ -73,6 +73,7 @@ public interface Grammar { * @param pathLength Length of the input path in a source input lattice. If a source input phrase * is used instead of a lattice, this value will likely be ignored by the underlying * implementation, but would normally be defined as endIndex-startIndex + * @return true if there is a rule for this span */ boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength); @@ -93,6 +94,12 @@ public interface Grammar { /** * This is used to construct a manual rule supported from outside the grammar, but the owner * should be the same as the grammar. Rule ID will the same as OOVRuleId, and no lattice cost + * @param lhs todo + * @param sourceWords todo + * @param targetWords todo + * @param scores todo + * @param arity todo + * @return the constructed {@link org.apache.joshua.decoder.ff.tm.Rule} */ @Deprecated Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity); @@ -100,7 +107,7 @@ public interface Grammar { /** * Dump the grammar to disk. * - * @param file + * @param file the file path to write to */ @Deprecated void writeGrammarOnDisk(String file); @@ -115,26 +122,28 @@ public interface Grammar { /** * Return the grammar's owner. + * @return grammar owner */ int getOwner(); /** - * Return the maximum source phrase length (terminals + nonterminals). + * Return the maximum source phrase length (terminals + nonterminals) + * @return the maximum source phrase length */ int getMaxSourcePhraseLength(); /** * Add an OOV rule for the requested word for the grammar. * - * @param word - * @param featureFunctions + * @param word input word to add rules to + * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s */ void addOOVRules(int word, List featureFunctions); /** * Add a rule to the grammar. * - * @param Rule the rule + * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule} */ void addRule(Rule rule); } diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java similarity index 84% rename from src/joshua/decoder/ff/tm/GrammarReader.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java index f94a472b..2bdc9ac1 100644 --- a/src/joshua/decoder/ff/tm/GrammarReader.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.io.IOException; import java.util.Iterator; -import java.util.logging.Level; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This is a base class for simple, ASCII line-based grammars that are stored on disk. 
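GrammarReader above doubles as Iterable and Iterator, parsing one rule per line with a single line of look-ahead. A stripped-down sketch of that pattern, with rule parsing elided and names hypothetical:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.Iterator;

    public class LineRuleReader implements Iterable<String>, Iterator<String> {
      private final BufferedReader reader;
      private String lookAhead; // one line of look-ahead, as in GrammarReader

      public LineRuleReader(String fileName) throws IOException {
        this.reader = new BufferedReader(new FileReader(fileName));
        this.lookAhead = reader.readLine(); // prime the look-ahead
      }

      @Override
      public Iterator<String> iterator() {
        return this;
      }

      @Override
      public boolean hasNext() {
        return lookAhead != null;
      }

      @Override
      public String next() {
        String line = lookAhead;
        try {
          lookAhead = reader.readLine(); // advance; null ends the iteration
        } catch (IOException e) {
          lookAhead = null;
        }
        return line; // GrammarReader parses this into a rule of type R here
      }
    }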
@@ -35,6 +35,8 @@ */ public abstract class GrammarReader implements Iterable, Iterator { + private static final Logger LOG = LoggerFactory.getLogger(GrammarReader.class); + protected static String fieldDelimiter; protected static String nonTerminalRegEx; protected static String nonTerminalCleanRegEx; @@ -46,7 +48,6 @@ public abstract class GrammarReader implements Iterable, Iter protected String lookAhead; protected int numRulesRead; - private static final Logger logger = Logger.getLogger(GrammarReader.class.getName()); // dummy constructor for public GrammarReader() { @@ -65,7 +66,7 @@ public void initialize() { + (null != e.getMessage() ? e.getMessage() : "No details available. Sorry."), e); } - Decoder.LOG(1, String.format("Reading grammar from file %s...", fileName)); + LOG.info("Reading grammar from file {}...", fileName); numRulesRead = 0; advanceReader(); } @@ -85,9 +86,8 @@ public void close() { try { this.reader.close(); } catch (IOException e) { - // FIXME: is this the right logging level? - if (logger.isLoggable(Level.WARNING)) - logger.info("Error closing grammar file stream: " + this.fileName); + LOG.warn(e.getMessage(), e); + LOG.error("Error closing grammar file stream: {}", this.fileName); } this.reader = null; } @@ -97,13 +97,13 @@ public void close() { * For correct behavior close must be called on every GrammarReader, however this * code attempts to avoid resource leaks. * - * @see joshua.util.io.LineReader + * @see org.apache.joshua.util.io.LineReader */ @Override protected void finalize() throws Throwable { if (this.reader != null) { - logger.severe("Grammar file stream was not closed, this indicates a coding error: " - + this.fileName); + LOG.error("Grammar file stream was not closed, this indicates a coding error: {}", + this.fileName); } this.close(); @@ -120,7 +120,8 @@ private void advanceReader() { lookAhead = reader.readLine(); numRulesRead++; } catch (IOException e) { - logger.severe("Error reading grammar from file: " + fileName); + LOG.error("Error reading grammar from file: {}", fileName); + LOG.error(e.getMessage(), e); } if (lookAhead == null && reader != null) { this.close(); @@ -136,10 +137,12 @@ public R next() { int oldProgress = reader.progress(); advanceReader(); - + + if (Decoder.VERBOSE >= 1) { int newProgress = (reader != null) ? reader.progress() : 100; + //TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %! if (newProgress > oldProgress) { for (int i = oldProgress + 1; i <= newProgress; i++) if (i == 97) { @@ -173,7 +176,7 @@ public R next() { /** * Removes square brackets (and index, if present) from nonterminal id - * @param tokenID + * @param tokenID the int ID to clean * @return cleaned ID */ public static int cleanNonTerminal(int tokenID) { @@ -183,7 +186,7 @@ public static int cleanNonTerminal(int tokenID) { /** * Removes square brackets (and index, if present) from nonterminal id - * @param token + * @param token the string ID to clean * @return cleaned token */ public static String cleanNonTerminal(String token) { diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java b/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java new file mode 100644 index 00000000..26cd50c2 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.tm; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.joshua.corpus.SymbolTable; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * this class implements MonolingualRule + * + * @author Zhifei Li, zhifei.work@gmail.com + * @version $LastChangedDate: 2010-02-10 09:59:38 -0600 (Wed, 10 Feb 2010) $ + */ +public class MonolingualRule extends Rule { + + private static final Logger LOG = LoggerFactory.getLogger(MonolingualRule.class); + + //=============================================================== + // Instance Fields + //=============================================================== + + /* The string format of Rule is: + * [Phrase] ||| french ||| english ||| feature scores + */ + private int ruleID; + private int lhs; // tag of this rule + private int[] pFrench; //pointer to the RuleCollection, as all the rules under it share the same Source side + private int arity; + private float[] featScores; // the feature scores for this rule + + /* a feature function will be fired for this rule + * only if the owner of the rule matches the owner of the feature function + */ + private int owner; + + // TODO: consider remove this from the general class, and + // create a new specific Rule class + private float latticeCost; + + /** + * estimate_cost depends on rule itself: statelesscost + + * transition_cost(non-stateless/non-contexual* models), + * we need this variable in order to provide sorting for + * cube-pruning + */ + private float est_cost = 0; + + //=============================================================== + // Static Fields + //=============================================================== + + // TODO: Ideally, we shouldn't have to have dummy rule IDs + // and dummy owners. How can this need be eliminated? + public static final int DUMMY_RULE_ID = 1; + public static final int DUMMY_OWNER = 1; + + + //=============================================================== + // Constructors + //=============================================================== + + /** + * Constructs a new rule using the provided parameters. The + * owner and rule id for this rule are undefined. + * + * @param lhs Left-hand side of the rule. + * @param sourceRhs Source language right-hand side of the rule. + * @param featureScores Feature value scores for the rule. + * @param arity Number of nonterminals in the source language + * right-hand side. 
+ * @param owner todo + * @param latticeCost todo + * @param ruleID todo + */ + public MonolingualRule(int lhs, int[] sourceRhs, float[] featureScores, int arity, int owner, float latticeCost, int ruleID) { + this.lhs = lhs; + this.pFrench = sourceRhs; + this.featScores = featureScores; + this.arity = arity; + this.latticeCost = latticeCost; + this.ruleID = ruleID; + this.owner = owner; + } + + + // called by class who does not care about lattice_cost, + // rule_id, and owner + public MonolingualRule(int lhs_, int[] source_rhs, float[] feature_scores, int arity_) { + this.lhs = lhs_; + this.pFrench = source_rhs; + this.featScores = feature_scores; + this.arity = arity_; + + //==== dummy values + this.latticeCost = 0; + this.ruleID = DUMMY_RULE_ID; + this.owner = DUMMY_OWNER; + } + + + //=============================================================== + // Attributes + //=============================================================== + + public final void setRuleID(int id) { this.ruleID = id; } + + public final int getRuleID() { return this.ruleID; } + + + public final void setArity(int arity) { this.arity = arity; } + + public final int getArity() { return this.arity; } + + + public final void setOwner(int owner) { this.owner = owner; } + + public final int getOwner() { return this.owner; } + + + public final void setLHS(int lhs) { this.lhs = lhs; } + + public final int getLHS() { return this.lhs; } + + + public void setEnglish(int[] eng) { + //TODO: do nothing + } + + public int[] getEnglish() { + //TODO + return null; + } + + + public final void setFrench(int[] french) { this.pFrench = french; } + + public final int[] getFrench() { return this.pFrench; } + + + public final void setFeatureScores(float[] scores) { + this.featScores = scores; + } + + public final float[] getFeatureScores() { + return this.featScores; + } + + + public final void setLatticeCost(float cost) { this.latticeCost = cost; } + + public final float getLatticeCost() { return this.latticeCost; } + + + public final float getEstCost() { + if (est_cost <= Double.NEGATIVE_INFINITY) { + LOG.warn("The est cost is neg infinity; must be bad rule; rule is:\n {}", this); + } + return est_cost; + } + + + /** + * Set a lower-bound estimate inside the rule returns full + * estimate. + */ + public final float estimateRuleCost(List featureFunctions) { + if (null == featureFunctions) { + return 0; + } else { + float estcost = 0.0f; + for (FeatureFunction ff : featureFunctions) { + double mdcost = - ff.estimateLogP(this, -1) * ff.getWeight(); + estcost += mdcost; + } + + this.est_cost = estcost; + return estcost; + } + } + + //=============================================================== + // Methods + //=============================================================== + + public float incrementFeatureScore(int column, double score) { + synchronized(this) { + featScores[column] += score; + return featScores[column]; + } + } + + + public void setFeatureCost(int column, float score) { + synchronized(this) { + featScores[column] = score; + } + } + + + public float getFeatureCost(int column) { + synchronized(this) { + return featScores[column]; + } + } + + //=============================================================== + // Serialization Methods + //=============================================================== + // BUG: These are all far too redundant. Should be refactored to share. 
+ + // Caching this method significantly improves performance + // We mark it transient because it is, though cf + // java.io.Serializable + private transient String cachedToString = null; + + @Deprecated + public String toString(Map ntVocab, SymbolTable sourceVocab, SymbolTable targetVocab) { + if (null == this.cachedToString) { + StringBuffer sb = new StringBuffer(); + sb.append(ntVocab.get(this.lhs)); + sb.append(" ||| "); + sb.append(sourceVocab.getWords(this.pFrench,true)); + sb.append(" |||"); + for (int i = 0; i < this.featScores.length; i++) { + //sb.append(String.format(" %.4f", this.feat_scores[i])); + sb.append(' ').append(Float.toString(this.featScores[i])); + } + this.cachedToString = sb.toString(); + } + return this.cachedToString; + } + + + //print the rule in terms of Ingeters + @Deprecated + public String toString() { + if (null == this.cachedToString) { + StringBuffer sb = new StringBuffer(); + sb.append(this.lhs); + sb.append(" ||| "); + sb.append(Arrays.toString(this.pFrench)); + sb.append(" |||"); + for (int i = 0; i < this.featScores.length; i++) { + sb.append(String.format(" %.4f", this.featScores[i])); + } + this.cachedToString = sb.toString(); + } + return this.cachedToString; + } + + + //do not use cachedToString + @Deprecated + public String toString(SymbolTable symbolTable) { + StringBuffer sb = new StringBuffer(); + sb.append(symbolTable.getWord(this.lhs)); + sb.append(" ||| "); + sb.append(symbolTable.getWords(this.pFrench)); + sb.append(" |||"); + for (int i = 0; i < this.featScores.length; i++) { + sb.append(String.format(" %.4f", this.featScores[i])); + } + return sb.toString(); + } + + + @Deprecated + public String toStringWithoutFeatScores(SymbolTable symbolTable) { + StringBuffer sb = new StringBuffer(); + if(symbolTable==null) + sb.append(this.getLHS()); + else + sb.append(symbolTable.getWord(this.getLHS())); + + return sb.append(" ||| ") + .append(convertToString(this.getFrench(), symbolTable)) + .toString(); + } + + public String convertToString(int[] words, SymbolTable symbolTable){ + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < words.length; i++) { + if(symbolTable!=null) + sb.append( symbolTable.getWord(words[i]) ); + else + sb.append(words[i]); + + if(iA class for reading in rules from a Moses phrase table. Most of the conversion work is done + * in {@link org.apache.joshua.decoder.ff.tm.format.PhraseFormatReader}. This includes prepending every * rule with a nonterminal, so that the phrase-based decoder can assume the same hypergraph * format as the hierarchical decoder (by pretending to be a strictly left-branching grammar and * dispensing with the notion of coverage spans). However, prepending the nonterminals means all * the alignments are off by 1. We do not want to fix those when reading in due to the expense, - * so instead we use this rule which adjust the alignments on the fly. + * so instead we use this rule which adjust the alignments on the fly.

* - * Also, we only convert the Moses dense features on the fly, via this class. + * <p>Also, we only convert the Moses dense features on the fly, via this class.
* - * TODO: this class should also be responsible for prepending the nonterminals. + * <p>TODO: this class should also be responsible for prepending the nonterminals.
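A minimal standalone sketch of the on-the-fly adjustment described above (assumed logic, not the actual Joshua method): once a nonterminal has been prepended to both sides, every source and target alignment index must be shifted up by one.

```java
// Standalone sketch of the on-the-fly fix-up: alignment points are stored as
// flat (source, target) index pairs, and prepending a nonterminal to both
// sides shifts every index up by one.
public class AlignmentShiftSketch {

  static byte[] shiftByOne(byte[] alignment) {
    byte[] shifted = new byte[alignment.length];
    for (int i = 0; i < alignment.length; i++)
      shifted[i] = (byte) (alignment[i] + 1);
    return shifted;
  }

  public static void main(String[] args) {
    byte[] original = { 0, 0, 1, 2 }; // "0-0 1-2" in Moses notation
    byte[] shifted = shiftByOne(original);
    for (int i = 0; i < shifted.length; i += 2)
      System.out.println(shifted[i] + "-" + shifted[i + 1]); // 1-1 2-3
  }
}
```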

* * @author Matt Post * diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java similarity index 92% rename from src/joshua/decoder/ff/tm/Rule.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java index 9f1fb8fe..255324aa 100644 --- a/src/joshua/decoder/ff/tm/Rule.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.ArrayList; import java.util.Arrays; @@ -29,11 +29,13 @@ import com.google.common.base.Supplier; import com.google.common.base.Suppliers; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class define the interface for Rule. @@ -42,19 +44,15 @@ * Note that not all features need to be negative log probs, but you should be aware that they * will be negated, so if you want a positive count, it should come in as negative. * - * @author Zhifei Li, - */ - - -/** * Normally, the feature score in the rule should be *cost* (i.e., -LogP), so that the feature * weight should be positive * - * @author Zhifei Li, - * @author Matt Post + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class Rule implements Comparator, Comparable { + private static final Logger LOG = LoggerFactory.getLogger(Rule.class); private int lhs; // tag of this rule private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same // Source side @@ -100,7 +98,7 @@ public class Rule implements Comparator, Comparable { * @param targetRhs Target language right-hand side of the rule. * @param sparseFeatures Feature value scores for the rule. * @param arity Number of nonterminals in the source language right-hand side. - * @param owner + * @param owner todo */ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) { this.lhs = lhs; @@ -114,7 +112,13 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in } /** - * Constructor used by PackedGrammar's sortRules(). 
+ * Constructor used by PackedGrammar's sortRules() + * @param lhs todo + * @param sourceRhs todo + * @param targetRhs todo + * @param features todo + * @param arity todo + * @param owner todo */ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) { this.lhs = lhs; @@ -130,6 +134,11 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, i /** * Constructor used for SamtFormatReader and GrammarBuilderWalkerFunction's getRuleWithSpans() * Owner set to -1 + * @param lhs todo + * @param sourceRhs todo + * @param targetRhs todo + * @param sparseFeatures todo + * @param arity todo */ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) { this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1); @@ -137,6 +146,12 @@ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, in /** * Constructor used for addOOVRules(), HieroFormatReader and PhraseRule. + * @param lhs todo + * @param sourceRhs todo + * @param targetRhs todo + * @param sparseFeatures todo + * @param arity todo + * @param alignment todo */ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) { this(lhs, sourceRhs, targetRhs, sparseFeatures, arity); @@ -283,6 +298,8 @@ public int[] getFrench() { * This function returns the dense (phrasal) features discovered when the rule was loaded. Dense * features are the list of unlabeled features that preceded labeled ones. They can also be * specified as labeled features of the form "tm_OWNER_INDEX", but the former format is preferred. + * + * @return the {@link org.apache.joshua.decoder.ff.FeatureVector} for this rule */ public FeatureVector getFeatureVector() { return featuresSupplier.get(); @@ -355,12 +372,10 @@ public float estimateRuleCost(List models) { if (this.estimatedCost <= Float.NEGATIVE_INFINITY) { this.estimatedCost = 0.0f; // weights.innerProduct(computeFeatures()); - if (Decoder.VERBOSE >= 4) - System.err.println(String.format("estimateCost(%s ;; %s)", getFrenchWords(), getEnglishWords())); + LOG.debug("estimateCost({} ;; {})", getFrenchWords(), getEnglishWords()); for (FeatureFunction ff : models) { float val = ff.estimateCost(this, null); - if (Decoder.VERBOSE >= 4) - System.err.println(String.format(" FEATURE %s -> %.3f", ff.getName(), val)); + LOG.debug(" FEATURE {} -> {}", ff.getName(), val); this.estimatedCost += val; } } @@ -389,7 +404,7 @@ public String toString() { /** * Returns a version of the rule suitable for reading in from a text file. * - * @return + * @return string version of the rule */ public String textFormat() { StringBuffer sb = new StringBuffer(); @@ -425,6 +440,8 @@ public String getFeatureString() { /** * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired, * with position i indexing the source and i+1 the target. + * + * @return a byte[] from the {@link com.google.common.base.Supplier} */ public byte[] getAlignment() { return this.alignmentSupplier.get(); @@ -468,7 +485,7 @@ public boolean isTerminal() { /** * Return the French (source) nonterminals as list of Strings * - * @return + * @return a list of strings */ public int[] getForeignNonTerminals() { int[] nts = new int[getArity()]; @@ -481,6 +498,8 @@ public int[] getForeignNonTerminals() { /** * Returns an array of size getArity() containing the source indeces of non terminals. 
+ * + * @return an array of size getArity() containing the source indeces of non terminals */ public int[] getNonTerminalSourcePositions() { int[] nonTerminalPositions = new int[getArity()]; @@ -495,6 +514,8 @@ public int[] getNonTerminalSourcePositions() { /** * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions. * Used by the WordAlignmentExtractor. + * + * @return a {@link java.util.Map} of alignments */ public Map> getAlignmentMap() { byte[] alignmentArray = getAlignment(); @@ -515,7 +536,7 @@ public Map> getAlignmentMap() { /** * Return the English (target) nonterminals as list of Strings * - * @return + * @return list of strings */ public int[] getEnglishNonTerminals() { int[] nts = new int[getArity()]; @@ -570,8 +591,8 @@ private Pattern getPattern() { /** * Matches the string representation of the rule's source side against a sentence * - * @param sentence - * @return + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @return true if there is a match */ public boolean matches(Sentence sentence) { boolean match = getPattern().matcher(sentence.fullSource()).find(); diff --git a/src/joshua/decoder/ff/tm/RuleCollection.java b/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java similarity index 83% rename from src/joshua/decoder/ff/tm/RuleCollection.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java index 6812fd57..a45c41b2 100644 --- a/src/joshua/decoder/ff/tm/RuleCollection.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.List; -import joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureFunction; /** * A RuleCollection represents a set of rules that share the same source side (and hence the same @@ -29,7 +29,7 @@ * * @author Zhifei Li * @author Lane Schwartz - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public interface RuleCollection { @@ -37,18 +37,22 @@ public interface RuleCollection { * Returns true if the rules are sorted. This is used to allow rules to be sorted in an amortized * fashion; rather than sorting all trie nodes when the grammar is originally loaded, we sort them * only as the decoder actually needs them. + * @return true if rules are sorted */ boolean isSorted(); /** - * This returns a list of the rules, sorting them if necessary. + * This returns a list of the rules, sorting them if necessary. * - * Implementations of this function should be synchronized. + * Implementations of this function should be synchronized. + * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @return the {@link java.util.List} of sorted rules */ List getSortedRules(List models); /** * Get the list of rules. There are no guarantees about whether they're sorted or not. 
+ * @return the {@link java.util.List} of rules, there is no gurantee they will be sorted */ List getRules(); diff --git a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java similarity index 92% rename from src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java index d5407276..2362cfd6 100644 --- a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java @@ -16,25 +16,30 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map.Entry; -import joshua.decoder.ff.tm.hash_based.ExtensionIterator; -import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator; +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class implements dynamic sentence-level filtering. This is accomplished with a parallel * trie, a subset of the original trie, that only contains trie paths that are reachable from * traversals of the current sentence. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar { + + private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class); + private AbstractGrammar baseGrammar; private SentenceFilteredTrie filteredTrie; private int[] tokens; @@ -44,8 +49,8 @@ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar { * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained * from the base grammar, which contains the complete grammar). * - * @param baseGrammar - * @param sentence + * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate + * @param sentence {@link org.apache.joshua.lattice.Lattice} input */ SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) { super(baseGrammar.joshuaConfiguration); @@ -62,9 +67,8 @@ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar { float seconds = (System.currentTimeMillis() - startTime) / 1000.0f; - System.err.println(String.format( - "Sentence-level filtering of sentence %d (%d -> %d rules) in %.3f seconds", sentence.id(), - origCount, filteredCount, seconds)); + LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds", + sentence.id(), origCount, filteredCount, seconds); } @Override @@ -90,8 +94,8 @@ public int getNumRules() { /** * A convenience function that counts the number of rules in a grammar's trie. * - * @param node - * @return + * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules + * @return the number of rules */ public int getNumRules(Trie node) { int numRules = 0; @@ -144,6 +148,7 @@ public boolean isRegexpGrammar() { * subsequent ones would have to consume just one word. We then just have to record in the * recursive call whether the last traversal was a nonterminal or not. 
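The traversal just described can be illustrated with a small standalone sketch (hypothetical code, not the actual filter): a rule's source side survives only if its terminals occur in order in the sentence, with each nonterminal gap consuming at least one word.

```java
// Standalone sketch of sentence filtering: terminals must match in order,
// while a nonterminal (null here) may consume one or more words.
public class FilterSketch {

  static boolean matches(String[] pattern, String[] sentence, int p, int s) {
    if (p == pattern.length) return true;
    if (s >= sentence.length) return false;
    if (pattern[p] == null) {
      // Nonterminal: try every continuation that consumes at least one word.
      for (int k = s + 1; k <= sentence.length; k++)
        if (matches(pattern, sentence, p + 1, k)) return true;
      return false;
    }
    // Terminal: must match the current word exactly.
    return sentence[s].equals(pattern[p]) && matches(pattern, sentence, p + 1, s + 1);
  }

  public static void main(String[] args) {
    String[] sentence = "the cat sat on the mat".split(" ");
    String[] src = { "cat", null, "the" }; // source side "cat [X] the"
    boolean found = false;
    for (int start = 0; start < sentence.length && !found; start++)
      found = matches(src, sentence, 0, start);
    System.out.println(found); // true
  }
}
```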
* + * @param unfilteredTrieRoot todo * @return the root of the filtered trie */ private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) { @@ -246,6 +251,7 @@ private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) { * source side of each rule collection against the input sentence. Failed matches are discarded, * and trie nodes extending from that position need not be explored. * + * @param unfilteredTrie todo * @return the root of the filtered trie if any rules were retained, otherwise null */ @SuppressWarnings("unused") @@ -283,7 +289,7 @@ private boolean matchesSentence(Trie childTrie) { * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match * the given input sentence. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * */ public class SentenceFilteredTrie implements Trie { @@ -297,8 +303,7 @@ public class SentenceFilteredTrie implements Trie { /** * Constructor. * - * @param trieRoot - * @param source + * @param unfilteredTrieNode todo */ public SentenceFilteredTrie(Trie unfilteredTrieNode) { this.unfilteredTrieNode = unfilteredTrieNode; diff --git a/src/joshua/decoder/ff/tm/Trie.java b/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java similarity index 80% rename from src/joshua/decoder/ff/tm/Trie.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java index df481d64..51d2dd8f 100644 --- a/src/joshua/decoder/ff/tm/Trie.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; import java.util.Collection; import java.util.HashMap; @@ -25,15 +25,15 @@ /** * An interface for trie-like data structures. * - * @author wren ng thornton - * @author Zhifei Li, + * @author wren ng thornton wren@users.sourceforge.net + * @author Zhifei Li, zhifei.work@gmail.com */ public interface Trie { /** * Traverse one ply further down the trie. If there is no match, the result is null. * - * @param wordID + * @param wordID input word ID * @return Child node of this trie */ Trie match(int wordID); @@ -53,30 +53,30 @@ public interface Trie { * null. * * @return A list of extended Trie nodes if this node has extensions, - * null + * null * otherwise */ Collection getExtensions(); /** - * If the trie node has extensions, get a list of their labels. + * If the trie node has extensions, get a {@link java.util.HashMap} of their labels. * - * @return + * @return a {@link java.util.HashMap} pf node extensions */ HashMap getChildren(); /** * Returns an iterator over the trie node's extensions with terminal labels. * - * @return + * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels */ Iterator getTerminalExtensionIterator(); /** * Returns an iterator over the trie node's extensions with nonterminal labels. * - * @return + * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels */ Iterator getNonterminalExtensionIterator(); @@ -100,6 +100,8 @@ public interface Trie { * true. *
* <li>The collection must be sorted (at least as used by TMGrammar)</li>
  • * + * @return a {@link org.apache.joshua.decoder.ff.tm.RuleCollection} representing the rules + * at the current node/state */ RuleCollection getRuleCollection(); diff --git a/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java b/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java similarity index 97% rename from src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java index 71fe6b2f..33587752 100644 --- a/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm; +package org.apache.joshua.decoder.ff.tm; /** * Unchecked runtime exception thrown to indicate that a collection of rules has not been properly diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java similarity index 93% rename from src/joshua/decoder/ff/tm/format/HieroFormatReader.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java index a47813dd..04a206a9 100644 --- a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.format; +package org.apache.joshua.decoder.ff.tm.format; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.GrammarReader; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.GrammarReader; +import org.apache.joshua.decoder.ff.tm.Rule; /** * This class implements reading files in the format defined by David Chiang for Hiero. * - * @author Unknown - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class HieroFormatReader extends GrammarReader { diff --git a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java b/src/main/java/org/apache/joshua/decoder/ff/tm/format/PhraseFormatReader.java similarity index 83% rename from src/joshua/decoder/ff/tm/format/PhraseFormatReader.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/format/PhraseFormatReader.java index be4d5221..870683ac 100644 --- a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/PhraseFormatReader.java @@ -16,24 +16,24 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.format; +package org.apache.joshua.decoder.ff.tm.format; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.PhraseRule; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.PhraseRule; +import org.apache.joshua.util.io.LineReader; /*** - * This class reads in the Moses phrase table format, with support for the source and target side, + *

<p>This class reads in the Moses phrase table format, with support for the source and target side, * list of features, and word alignments. It works by simply casting the phrase-based rules to - * left-branching hierarchical rules and passing them on to its parent class, {@HieroFormatReader}. + * left-branching hierarchical rules and passing them on to its parent class, {@link org.apache.joshua.decoder.ff.tm.format.HieroFormatReader}.
* - * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage: + * <p>There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
* <pre>
    - *     cat PHRASE_TABLE | java -cp $JOSHUA/class joshua.decoder.ff.tm.format.PhraseFormatReader > grammar
    + *     cat PHRASE_TABLE | java -cp $JOSHUA/class org.apache.joshua.decoder.ff.tm.format.PhraseFormatReader > grammar
* </pre>
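As a rough standalone sketch of what such a conversion involves (the exact feature handling in Joshua may differ; the -log transform and field layout here are assumptions based on the standard Moses format): prepend [X] to both sides and turn Moses probabilities into costs.

```java
// Hypothetical sketch of converting one Moses phrase-table line,
//   "src words ||| tgt words ||| p1 p2 ...",
// into a left-branching Hiero-style rule with a prepended [X].
public class MosesLineSketch {

  static String convert(String line) {
    String[] fields = line.split(" \\|\\|\\| ");
    StringBuilder feats = new StringBuilder();
    for (String p : fields[2].trim().split("\\s+")) {
      if (feats.length() > 0) feats.append(' ');
      // Moses stores probabilities; emit negative log costs instead.
      feats.append(String.format("%.4f", -Math.log(Double.parseDouble(p))));
    }
    return "[X] ||| [X] " + fields[0] + " ||| [X] " + fields[1] + " ||| " + feats;
  }

  public static void main(String[] args) {
    System.out.println(convert("der Hund ||| the dog ||| 0.5 0.25"));
    // -> [X] ||| [X] der Hund ||| [X] the dog ||| 0.6931 1.3863
  }
}
```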
    * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * */ @@ -116,7 +116,7 @@ public PhraseRule parseLine(String line) { /** * Converts a Moses phrase table to a Joshua grammar. * - * @param args + * @param args arguments required to do the conversion */ public static void main(String[] args) { PhraseFormatReader reader = new PhraseFormatReader(); diff --git a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java b/src/main/java/org/apache/joshua/decoder/ff/tm/format/SamtFormatReader.java similarity index 90% rename from src/joshua/decoder/ff/tm/format/SamtFormatReader.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/format/SamtFormatReader.java index 6539d38a..c2657282 100644 --- a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/SamtFormatReader.java @@ -16,18 +16,18 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.format; +package org.apache.joshua.decoder.ff.tm.format; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.GrammarReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.GrammarReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class SamtFormatReader extends GrammarReader { - private static final Logger logger = Logger.getLogger(SamtFormatReader.class.getName()); - + private static final Logger LOG = LoggerFactory.getLogger(SamtFormatReader.class); private static final String samtNonTerminalMarkup; static { @@ -51,8 +51,8 @@ public SamtFormatReader(String grammarFile) { protected Rule parseLine(String line) { String[] fields = line.split(fieldDelimiter); if (fields.length != 4) { - logger.severe("Rule line does not have four fields: " + line); - logger.severe("Skipped."); + LOG.error("Rule line does not have four fields: {}", line); + LOG.error("Skipped."); return null; } diff --git a/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java similarity index 97% rename from src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java index d6b5b974..ecb355dd 100644 --- a/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.hash_based; +package org.apache.joshua.decoder.ff.tm.hash_based; import java.util.HashMap; import java.util.Iterator; diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java similarity index 86% rename from src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java index 4ba514a5..40997978 100644 --- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java @@ -16,26 +16,28 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.ff.tm.hash_based; +package org.apache.joshua.decoder.ff.tm.hash_based; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.JoshuaConfiguration.OOVItem; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.tm.AbstractGrammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.GrammarReader; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.ff.tm.format.PhraseFormatReader; -import joshua.decoder.ff.tm.format.SamtFormatReader; -import joshua.util.FormatUtils; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.tm.AbstractGrammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.GrammarReader; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.ff.tm.format.PhraseFormatReader; +import org.apache.joshua.decoder.ff.tm.format.SamtFormatReader; +import org.apache.joshua.util.FormatUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class implements a memory-based bilingual BatchGrammar. @@ -44,11 +46,13 @@ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key * in HashMap * - * @author Zhifei Li - * @author Matt Post featureFunctions) { @@ -288,7 +291,7 @@ public void addOOVRules(int sourceWord, List featureFunctions) /** * Adds a default set of glue rules. * - * @param featureFunctions + * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s */ public void addGlueRules(ArrayList featureFunctions) { HieroFormatReader reader = new HieroFormatReader(); diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java similarity index 89% rename from src/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java index 194c594c..f91df1e1 100644 --- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.hash_based; +package org.apache.joshua.decoder.ff.tm.hash_based; -import joshua.decoder.ff.tm.BasicRuleCollection; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.BasicRuleCollection; +import org.apache.joshua.decoder.ff.tm.Rule; /** * Stores a collection of all rules with the same french side (and thus same arity). 
* - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class MemoryBasedRuleBin extends BasicRuleCollection { diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java similarity index 92% rename from src/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java index baa46f7a..998688a9 100644 --- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.hash_based; +package org.apache.joshua.decoder.ff.tm.hash_based; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; /** - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class MemoryBasedTrie implements Trie { MemoryBasedRuleBin ruleBin = null; diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java new file mode 100644 index 00000000..695a0a45 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * Provides implementations of hierarchical phrase-based translation grammars. + */ +package org.apache.joshua.decoder.ff.tm.hash_based; diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java new file mode 100644 index 00000000..b804db6e --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * Defines interfaces and provides infrastructure for hierarchical + * phrase-based translation grammars. + */ +package org.apache.joshua.decoder.ff.tm; + diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java similarity index 94% rename from src/joshua/decoder/ff/tm/packed/PackedGrammar.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java index fb38cf04..c6dbadcf 100644 --- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.packed; +package org.apache.joshua.decoder.ff.tm.packed; /*** * This package implements Joshua's packed grammar structure, which enables the efficient loading @@ -80,35 +80,38 @@ import java.util.List; import java.util.Map; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.tm.AbstractGrammar; -import joshua.decoder.ff.tm.BasicRuleCollection; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.hash_based.ExtensionIterator; -import joshua.util.encoding.EncoderConfiguration; -import joshua.util.encoding.FloatEncoder; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.tm.AbstractGrammar; +import org.apache.joshua.decoder.ff.tm.BasicRuleCollection; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator; +import org.apache.joshua.util.encoding.EncoderConfiguration; +import org.apache.joshua.util.encoding.FloatEncoder; +import org.apache.joshua.util.io.LineReader; import com.google.common.base.Supplier; import com.google.common.base.Suppliers; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class PackedGrammar extends AbstractGrammar { - private EncoderConfiguration encoding; + private static final Logger LOG = LoggerFactory.getLogger(PackedGrammar.class); + public static final String VOCABULARY_FILENAME = "vocabulary"; + private EncoderConfiguration encoding; private PackedRoot root; private ArrayList slices; - private final File vocabFile; // store path to vocabulary file - public static final String VOCABULARY_FILENAME = "vocabulary"; + private final File vocabFile; // store path to vocabulary file // The grammar specification keyword (e.g., "thrax" or "moses") private String type; @@ -118,14 +121,14 @@ public class PackedGrammar extends AbstractGrammar { private final Cache> cached_rules; public PackedGrammar(String grammar_dir, int span_limit, String owner, String type, - JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException 
{ + JoshuaConfiguration joshuaConfiguration) throws IOException { super(joshuaConfiguration); this.spanLimit = span_limit; this.type = type; // Read the vocabulary. vocabFile = new File(grammar_dir + File.separator + VOCABULARY_FILENAME); - Decoder.LOG(1, String.format("Reading vocabulary: %s", vocabFile)); + LOG.info("Reading vocabulary: {}", vocabFile); if (!Vocabulary.read(vocabFile)) { throw new RuntimeException("mismatches or collisions while reading on-disk vocabulary"); } @@ -133,12 +136,12 @@ public PackedGrammar(String grammar_dir, int span_limit, String owner, String ty // Read the config String configFile = grammar_dir + File.separator + "config"; if (new File(configFile).exists()) { - Decoder.LOG(1, String.format("Reading packed config: %s", configFile)); + LOG.info("Reading packed config: {}", configFile); readConfig(configFile); } // Read the quantizer setup. - Decoder.LOG(1, String.format("Reading encoder configuration: %s%sencoding", grammar_dir, File.separator)); + LOG.info("Reading encoder configuration: {}{}encoding", grammar_dir, File.separator); encoding = new EncoderConfiguration(); encoding.load(grammar_dir + File.separator + "encoding"); @@ -159,7 +162,7 @@ public PackedGrammar(String grammar_dir, int span_limit, String owner, String ty root = new PackedRoot(slices); cached_rules = CacheBuilder.newBuilder().maximumSize(joshuaConfiguration.cachedRuleSize).build(); - Decoder.LOG(1, String.format("Loaded %d rules", count)); + LOG.info("Loaded {} rules", count); } @Override @@ -192,6 +195,7 @@ public Rule constructManualRule(int lhs, int[] src, int[] tgt, float[] scores, i /** * Computes the MD5 checksum of the vocabulary file. * Can be used for comparing vocabularies across multiple packedGrammars. + * @return the computed checksum */ public String computeVocabularyChecksum() { MessageDigest md; @@ -519,12 +523,12 @@ private synchronized final byte[] getAlignmentArray(int block_id) { try { alignments.get(alignment, 0, num_points * 2); } catch (BufferUnderflowException bue) { - Decoder.LOG(4, "Had an exception when accessing alignment mapped byte buffer"); - Decoder.LOG(4, "Attempting to access alignments at position: " + alignment_position + 1); - Decoder.LOG(4, "And to read this many bytes: " + num_points * 2); - Decoder.LOG(4, "Buffer capacity is : " + alignments.capacity()); - Decoder.LOG(4, "Buffer position is : " + alignments.position()); - Decoder.LOG(4, "Buffer limit is : " + alignments.limit()); + LOG.warn("Had an exception when accessing alignment mapped byte buffer"); + LOG.warn("Attempting to access alignments at position: {}", alignment_position + 1); + LOG.warn("And to read this many bytes: {}", num_points * 2); + LOG.warn("Buffer capacity is : {}", alignments.capacity()); + LOG.warn("Buffer position is : {}", alignments.position()); + LOG.warn("Buffer limit is : {}", alignments.limit()); throw bue; } return alignment; @@ -801,7 +805,7 @@ public void remove() { * to then put a nonterminal on the source and target sides to treat the phrase pairs like * left-branching rules, which is how Joshua deals with phrase decoding. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * */ public final class PackedPhrasePair extends PackedRule { @@ -856,7 +860,7 @@ private Supplier initializeAlignmentSupplier(){ /** * Take the English phrase of the underlying rule and prepend an [X]. 
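The prepend itself is a simple array copy; a standalone sketch with hypothetical ids:

```java
import java.util.Arrays;

// Sketch of prepending a nonterminal id to a phrase stored as an int[].
public class PrependSketch {

  static int[] prepend(int nonterminalId, int[] phrase) {
    int[] result = new int[phrase.length + 1];
    result[0] = nonterminalId;
    System.arraycopy(phrase, 0, result, 1, phrase.length);
    return result;
  }

  public static void main(String[] args) {
    int X = -1;                // hypothetical id for [X]
    int[] phrase = { 17, 42 }; // hypothetical word ids
    System.out.println(Arrays.toString(prepend(X, phrase))); // [-1, 17, 42]
  }
}
```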
* - * @return + * @return the augmented phrase */ @Override public int[] getEnglish() { @@ -866,7 +870,7 @@ public int[] getEnglish() { /** * Take the French phrase of the underlying rule and prepend an [X]. * - * @return + * @return the augmented French phrase */ @Override public int[] getFrench() { @@ -880,7 +884,7 @@ public int[] getFrench() { /** * Similarly the alignment array needs to be shifted over by one. * - * @return + * @return the byte[] alignment */ @Override public byte[] getAlignment() { diff --git a/src/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java similarity index 89% rename from src/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java rename to src/main/java/org/apache/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java index 0cb7e26e..c6d03a64 100644 --- a/src/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java +++ b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/SliceAggregatingTrie.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.ff.tm.packed; +package org.apache.joshua.decoder.ff.tm.packed; import static java.util.Collections.emptyList; import static java.util.Collections.unmodifiableList; @@ -30,35 +30,36 @@ import java.util.List; import java.util.Set; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.hash_based.ExtensionIterator; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator; /** - * SliceAggregatingTrie collapses multiple tries - * with the same source root (i.e. tries from multiple packed slices). + *

<p>SliceAggregatingTrie collapses multiple tries + * with the same source root (i.e. tries from multiple packed slices).
* - * Consider the example below. + * <p>Consider the example below. * Without SliceAggregatingTries, the following grammar rules could have only - * partitioned by splitting rule lists when the first word of SOURCE changes. (">" markers). + * been partitioned by splitting rule lists when the first word of SOURCE changes. ("&gt;" markers).
* - * Using a SliceAggregatingTrie allows splitting at changes of second SOURCE words (">>" marker). + * <p>Using a SliceAggregatingTrie allows splitting at changes of second SOURCE words ("&gt;&gt;" marker).
* + * <pre>
      * EXAMPLE: (LHS ||| SOURCE ||| TARGET)
      * [X] ||| - ||| -
    - * >
+ * &gt;
      * [X] ||| [X] ||| [X]
    - * >>
+ * &gt;&gt;
      * [X] ||| [X] a ||| [X] A
      * [X] ||| [X] a ||| [X] A
    - * >>
+ * &gt;&gt;
      * [X] ||| [X] b ||| [X] B
    - * >
+ * &gt;
      * [X] ||| u ||| u
    - * 
    - * A SliceAggregatingTrie node behaves just like a regular Trie node but subsumes a list of extensions/children.
+ * </pre>
+ * <p>A SliceAggregatingTrie node behaves just like a regular Trie node but subsumes a list of extensions/children. * This class hides the complexity of having multiple tries with the same root * from nodes one level up. * Similar to PackedRoot, it maintains a lookup table of children's @@ -70,7 +71,7 @@ * must be found in exactly one of the subtries. * (!) This assumption relies on the sort order of the packed grammar. * If the grammar was incorrectly sorted and then packed, construction - * of SliceAggregatingTrie nodes fails. + * of SliceAggregatingTrie nodes fails.
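A standalone sketch of that lookup-table construction (plain maps instead of Trie nodes): merge the child maps of all slices, relying on each extension id appearing in exactly one of them.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch of the aggregation step: merge the per-slice child maps into one
// lookup table, relying on each extension id occurring in exactly one slice
// (which the packed grammar's sort order is assumed to guarantee).
public class AggregateSketch {

  static <V> HashMap<Integer, V> aggregate(List<HashMap<Integer, V>> slices) {
    HashMap<Integer, V> merged = new HashMap<>();
    for (HashMap<Integer, V> slice : slices)
      for (Map.Entry<Integer, V> e : slice.entrySet())
        if (merged.put(e.getKey(), e.getValue()) != null)
          throw new IllegalStateException("id in more than one slice: " + e.getKey());
    return merged;
  }

  public static void main(String[] args) {
    HashMap<Integer, String> a = new HashMap<>();
    a.put(1, "[X] a");
    HashMap<Integer, String> b = new HashMap<>();
    b.put(2, "[X] b");
    System.out.println(aggregate(List.of(a, b))); // {1=[X] a, 2=[X] b}
  }
}
```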

    * * @author fhieber */ diff --git a/src/joshua/decoder/hypergraph/AlignedSourceTokens.java b/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java similarity index 86% rename from src/joshua/decoder/hypergraph/AlignedSourceTokens.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java index 5c6b2dd6..948001f7 100644 --- a/src/joshua/decoder/hypergraph/AlignedSourceTokens.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.LinkedList; import java.util.ListIterator; @@ -24,13 +24,17 @@ /** * Class that represents a one to (possibly) many alignment from target to * source. Extends from a LinkedList. Instances of this class are updated by the - * WordAlignmentExtractor.substitute() method. The method shifts the + * WordAlignmentExtractor.substitute() method. + * The {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#shiftBy(int, int)} + * method shifts the * elements in the list by a scalar to reflect substitutions of non terminals in * the rule. if indexes are final, i.e. the point instance has been substituted - * into a parent WordAlignmentState once, is set to true. This is + * into a parent WordAlignmentState once, + * {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isFinal} is set to true. + * This is * necessary since the final source index of a point is known once we have * substituted in a complete WordAlignmentState into its parent. If the index in - * the list is a non terminal, = true + * the list is a non terminal, {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isNonTerminal} = true */ class AlignedSourceTokens extends LinkedList { diff --git a/src/joshua/decoder/hypergraph/AllSpansWalker.java b/src/main/java/org/apache/joshua/decoder/hypergraph/AllSpansWalker.java similarity index 79% rename from src/joshua/decoder/hypergraph/AllSpansWalker.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/AllSpansWalker.java index 3964bb26..1aad06f4 100644 --- a/src/joshua/decoder/hypergraph/AllSpansWalker.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/AllSpansWalker.java @@ -16,18 +16,18 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.HashSet; import java.util.Set; -import joshua.corpus.Span; +import org.apache.joshua.corpus.Span; /*** * Uses {@link ForestWalker} to visit one {@link HGNode} per span of the chart. No guarantees are * provided as to which HGNode will be visited in each span. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * */ @@ -42,11 +42,12 @@ public AllSpansWalker() { * This function wraps a {@link ForestWalker}, preventing calls to its walker function for all but * the first node reached for each span. 
* - * @param node - * @param walker + * @param node the {@link org.apache.joshua.decoder.hypergraph.HGNode} we wish to walk + * @param walker the {@link org.apache.joshua.decoder.hypergraph.WalkerFunction} + * implementation to do the walking */ public void walk(HGNode node, final WalkerFunction walker) { - new ForestWalker().walk(node, new joshua.decoder.hypergraph.WalkerFunction() { + new ForestWalker().walk(node, new org.apache.joshua.decoder.hypergraph.WalkerFunction() { @Override public void apply(HGNode node, int index) { if (node != null) { diff --git a/src/joshua/decoder/hypergraph/DefaultInsideOutside.java b/src/main/java/org/apache/joshua/decoder/hypergraph/DefaultInsideOutside.java similarity index 99% rename from src/joshua/decoder/hypergraph/DefaultInsideOutside.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/DefaultInsideOutside.java index 69d89b76..c6dae776 100644 --- a/src/joshua/decoder/hypergraph/DefaultInsideOutside.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/DefaultInsideOutside.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.HashMap; @@ -25,7 +25,7 @@ * to use the functions here, one need to extend the class to provide a way to calculate the * transitionLogP based on feature set * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com * @version $LastChangedDate$ */ diff --git a/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java b/src/main/java/org/apache/joshua/decoder/hypergraph/FeatureVectorExtractor.java similarity index 84% rename from src/joshua/decoder/hypergraph/FeatureVectorExtractor.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/FeatureVectorExtractor.java index dbe4f4b0..a8525bea 100644 --- a/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/FeatureVectorExtractor.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; -import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures; +import static org.apache.joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures; import java.util.List; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.hypergraph.KBestExtractor.DerivationState; -import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationState; +import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor; +import org.apache.joshua.decoder.segment_file.Sentence; /** * During decoding, individual features values are not stored, only the model score on each edge. 
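Since only the aggregate model score is kept on each edge, recovering a per-feature breakdown means revisiting every edge of a derivation and summing what each one fires; a standalone sketch of that accumulation (all feature names below are illustrative):

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch of feature-vector extraction: sum the features fired on each edge
// of a derivation to rebuild the per-feature totals the decoder discarded.
public class ExtractSketch {

  static Map<String, Float> extract(List<Map<String, Float>> firedPerEdge) {
    Map<String, Float> total = new HashMap<>();
    for (Map<String, Float> fired : firedPerEdge)
      fired.forEach((name, value) -> total.merge(name, value, Float::sum));
    return total;
  }

  public static void main(String[] args) {
    System.out.println(extract(List.of(
        Map.of("lm", -2.3f, "tm_pt_0", -0.5f),
        Map.of("lm", -1.1f, "WordPenalty", -0.435f))));
  }
}
```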
diff --git a/src/joshua/decoder/hypergraph/ForestWalker.java b/src/main/java/org/apache/joshua/decoder/hypergraph/ForestWalker.java similarity index 98% rename from src/joshua/decoder/hypergraph/ForestWalker.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/ForestWalker.java index 72b7fc7e..e58670a8 100644 --- a/src/joshua/decoder/hypergraph/ForestWalker.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/ForestWalker.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.HashSet; import java.util.Set; diff --git a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java b/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java similarity index 90% rename from src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java index 12e79c59..7908d28d 100644 --- a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.io.PrintStream; import java.util.HashSet; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This walker function builds up a new context-free grammar by visiting each node in a hypergraph. @@ -42,6 +44,9 @@ * non-terminal symbol is annotated with the span of its node. 
*/ public class GrammarBuilderWalkerFunction implements WalkerFunction { + + private static final Logger LOG = LoggerFactory.getLogger(GrammarBuilderWalkerFunction.class); + private MemoryBasedBatchGrammar grammar; private static HieroFormatReader reader = new HieroFormatReader(); private PrintStream outStream; @@ -149,7 +154,7 @@ private static int[] getNewTargetFromSource(int[] source) { private static HGNode getGoalSymbolNode(HGNode root) { if (root.hyperedges == null || root.hyperedges.size() == 0) { - System.err.println("getGoalSymbolNode: root node has no hyperedges"); + LOG.error("getGoalSymbolNode: root node has no hyperedges"); return null; } return root.hyperedges.get(0).getTailNodes().get(0); @@ -158,7 +163,7 @@ private static HGNode getGoalSymbolNode(HGNode root) { public static int goalSymbol(HyperGraph hg) { if (hg.goalNode == null) { - System.err.println("goalSymbol: goalNode of hypergraph is null"); + LOG.error("goalSymbol: goalNode of hypergraph is null"); return -1; } HGNode symbolNode = getGoalSymbolNode(hg.goalNode); diff --git a/src/joshua/decoder/hypergraph/HGNode.java b/src/main/java/org/apache/joshua/decoder/hypergraph/HGNode.java similarity index 93% rename from src/joshua/decoder/hypergraph/HGNode.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/HGNode.java index c45f40c4..695cad53 100644 --- a/src/joshua/decoder/hypergraph/HGNode.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/HGNode.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.ArrayList; import java.util.Comparator; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; /** * this class implement Hypergraph node (i.e., HGNode); also known as Item in parsing. * - * @author Zhifei Li, - * @author Juri Ganitkevitch, + * @author Zhifei Li, zhifei.work@gmail.com + * @author Juri Ganitkevitch, juri@cs.jhu.edu */ // TODO: handle the case that the Hypergraph only maintains the one-best tree @@ -93,6 +93,8 @@ public float getScore() { * Adds the hyperedge to the list of incoming hyperedges (i.e., ways to form this node), creating * the list if necessary. We then update the cache of the best incoming hyperedge via a call to * the (obscurely named) semiringPlus(). + * @param hyperEdge the {@link org.apache.joshua.decoder.hypergraph.HyperEdge} to add + * to the list of incoming hyperedges */ public void addHyperedgeInNode(HyperEdge hyperEdge) { if (hyperEdge != null) { @@ -106,6 +108,8 @@ public void addHyperedgeInNode(HyperEdge hyperEdge) { /** * Convenience function to add a list of hyperedges one at a time. + * @param hyperedges a {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HyperEdge}'s + * to add to the current HGNode. */ public void addHyperedgesInNode(List hyperedges) { for (HyperEdge hyperEdge : hyperedges) @@ -114,6 +118,7 @@ public void addHyperedgesInNode(List hyperedges) { /** * Updates the cache of the best incoming hyperedge. 
+ * @param hyperEdge an incoming {{@link org.apache.joshua.decoder.hypergraph.HyperEdge} */ public void semiringPlus(HyperEdge hyperEdge) { if (null == bestHyperedge || bestHyperedge.getBestDerivationScore() < hyperEdge.getBestDerivationScore()) { @@ -241,9 +246,7 @@ public HGNode node() { */ // sort by estTotalLogP: for pruning purpose public int compareTo(HGNode anotherItem) { - System.out.println("HGNode, compare functiuon should never be called"); - System.exit(1); - return 0; + throw new RuntimeException("HGNode, compare functiuon should never be called"); /* * if (this.estTotalLogP > anotherItem.estTotalLogP) { return -1; } else if (this.estTotalLogP * == anotherItem.estTotalLogP) { return 0; } else { return 1; } diff --git a/src/joshua/decoder/hypergraph/HyperEdge.java b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperEdge.java similarity index 93% rename from src/joshua/decoder/hypergraph/HyperEdge.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/HyperEdge.java index 114908ee..d7bcc4d3 100644 --- a/src/joshua/decoder/hypergraph/HyperEdge.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperEdge.java @@ -16,18 +16,18 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.List; -import joshua.decoder.chart_parser.SourcePath; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.chart_parser.SourcePath; +import org.apache.joshua.decoder.ff.tm.Rule; /** * this class implement Hyperedge * - * @author Zhifei Li, - * @author Matt Post + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class HyperEdge { diff --git a/src/joshua/decoder/hypergraph/HyperGraph.java b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraph.java similarity index 84% rename from src/joshua/decoder/hypergraph/HyperGraph.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraph.java index 003c930d..499d4f3d 100644 --- a/src/joshua/decoder/hypergraph/HyperGraph.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraph.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.io.IOException; import java.io.PrintWriter; @@ -24,14 +24,15 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.decoder.chart_parser.ComputeNodeResult; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.hypergraph.ForestWalker.TRAVERSAL; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.chart_parser.ComputeNodeResult; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.hypergraph.ForestWalker.TRAVERSAL; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * this class implement (1) HyperGraph-related data structures (Item and Hyper-edges) @@ -39,10 +40,12 @@ * Note: to seed the kbest extraction, each deduction should have the best_cost properly set. 
We do * not require any list being sorted * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com */ public class HyperGraph { + private static final Logger LOG = LoggerFactory.getLogger(HyperGraph.class); + // pointer to goal HGNode public HGNode goalNode = null; @@ -50,8 +53,6 @@ public class HyperGraph { public int numEdges = -1; public Sentence sentence = null; - static final Logger logger = Logger.getLogger(HyperGraph.class.getName()); - public HyperGraph(HGNode goalNode, int numNodes, int numEdges, Sentence sentence) { this.goalNode = goalNode; this.numNodes = numNodes; @@ -141,7 +142,8 @@ public void apply(HGNode node, int index) { /** * Dump the hypergraph to the specified file. * - * @param fileName + * @param fileName local file path + * @param model {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s */ public void dump(String fileName, List model) { try ( PrintWriter out = new PrintWriter(fileName, "UTF-8") ) { @@ -150,8 +152,8 @@ public void dump(String fileName, List model) { out.println(String.format("%d %d", numNodes, numEdges)); new ForestWalker(TRAVERSAL.POSTORDER).walk(this.goalNode, new HyperGraphDumper(out, model)); } catch (IOException e) { - System.err.println("* Can't dump hypergraph to file '" + fileName + "'"); - e.printStackTrace(); + LOG.error("Can't dump hypergraph to file '{}'", fileName); + LOG.error(e.getMessage(), e); } } diff --git a/src/joshua/decoder/hypergraph/HyperGraphPruning.java b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraphPruning.java similarity index 97% rename from src/joshua/decoder/hypergraph/HyperGraphPruning.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraphPruning.java index 98b97d39..27f55254 100644 --- a/src/joshua/decoder/hypergraph/HyperGraphPruning.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/HyperGraphPruning.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.HashMap; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; /** * during the pruning process, many Item/Deductions may not be explored at all due to the early-stop * in pruning_deduction * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com * @version $LastChangedDate$ */ public class HyperGraphPruning extends TrivialInsideOutside { diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java similarity index 90% rename from src/joshua/decoder/hypergraph/KBestExtractor.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java index 6dd3207c..9e7cbbb0 100644 --- a/src/joshua/decoder/hypergraph/KBestExtractor.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java @@ -16,79 +16,80 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; -import static joshua.util.FormatUtils.unescapeSpecialSymbols; -import static joshua.util.FormatUtils.removeSentenceMarkers; +import static org.apache.joshua.util.FormatUtils.unescapeSpecialSymbols; +import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; -import java.util.Arrays; -import java.util.Comparator; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.PriorityQueue; -import joshua.corpus.Vocabulary; -import joshua.decoder.BLEU; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.fragmentlm.Tree; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.io.DeNormalize; -import joshua.decoder.segment_file.Sentence; -import joshua.decoder.segment_file.Token; -import joshua.util.FormatUtils; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.BLEU; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.fragmentlm.Tree; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.io.DeNormalize; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.util.FormatUtils; + +import cern.colt.Arrays; /** - * This class implements lazy k-best extraction on a hyper-graph. + *

This class implements lazy k-best extraction on a hyper-graph.
 *
 * K-best extraction over hypergraphs is a little hairy, but is best understood in the following
 * manner. Imagine a hypergraph, which is composed of nodes connected by hyperedges. A hyperedge has
 * exactly one parent node and 1 or more tail nodes, corresponding to the rank of the rule that gave
 * rise to the hyperedge. Each node has 1 or more incoming hyperedges.
 *
 * K-best extraction works in the following manner. A derivation is a set of nodes and hyperedges
 * that leads from the root node down and exactly covers the source-side sentence. To define a
 * derivation, we start at the root node, choose one of its incoming hyperedges, and then recurse to
 * the tail (or antecedent) nodes of that hyperedge, where we continually make the same decision.
 *
 * Each hypernode has its hyperedges sorted according to their model score. To get the best
 * (Viterbi) derivation, we simply recursively follow the best hyperedge coming in to each
 * hypernode.
 *
 * How do we get the second-best derivation? It is defined by changing exactly one of the decisions
 * about which hyperedge to follow in the recursion. Somewhere, we take the second-best. Similarly,
 * the third-best derivation makes a single change from the second-best: either making another
 * (different) second-best choice somewhere along the 1-best derivation, or taking the third-best
 * choice at the same spot where the second-best derivation took the second-best choice. And so on.
 *
 * This class uses two classes that encode the necessary meta-information. The first is the
 * DerivationState class. It roughly corresponds to a hyperedge, and records, for each of that
 * hyperedge's tail nodes, which-best to take. So for a hyperedge with three tail nodes, the 1-best
 * derivation will be (1,1,1), the second-best will be one of (2,1,1), (1,2,1), or (1,1,2), the
 * third best will be one of
 *
 *     (3,1,1), (2,2,1), (1,1,3)
 *
 * and so on.
 *
 * The configuration parameter `output-format` controls what exactly is extracted from the forest.
 * See documentation for that below. Note that Joshua does not store individual feature values while
 * decoding, but only the cost of each edge (in the form of a float). Therefore, if you request
 * the feature values (`%f` in `output-format`), the feature functions must be replayed, which
 * is expensive.
 *
 * The configuration parameter `top-n` controls how many items are returned. If this is set to 0,
 * k-best extraction should be turned off entirely.
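A minimal Java sketch of the rank bookkeeping described above (illustrative names only, not Joshua's DerivationState API): a derivation is identified by the vector of which-best choices at a hyperedge's tail nodes, and each successor candidate bumps exactly one of those ranks.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    final class RankVectorSketch {
      // Successor derivations differ in exactly one position:
      // (1,1,1) -> (2,1,1), (1,2,1), (1,1,2).
      static List<int[]> successors(int[] ranks) {
        List<int[]> next = new ArrayList<>();
        for (int i = 0; i < ranks.length; i++) {
          int[] bumped = Arrays.copyOf(ranks, ranks.length);
          bumped[i]++;
          next.add(bumped);
        }
        return next;
      }
    }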

    * - * @author Zhifei Li, - * @author Matt Post + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class KBestExtractor { private final JoshuaConfiguration joshuaConfiguration; @@ -161,7 +162,12 @@ public DerivationState getKthDerivation(HGNode node, int k) { * Compute the string that is output from the decoder, using the "output-format" config file * parameter as a template. * - * You may need to reset_state() before you call this function for the first time. + * You may need to {@link org.apache.joshua.decoder.hypergraph.KBestExtractor#resetState()} + * before you call this function for the first time. + * + * @param node todo + * @param k todo + * @return todo */ public String getKthHyp(HGNode node, int k) { @@ -227,9 +233,9 @@ public String getKthHyp(HGNode node, int k) { * If requested, projects source-side lettercase to target, and appends the alignment from * to the source-side sentence in ||s. * - * @param hypothesis - * @param state - * @return + * @param hypothesis todo + * @param state todo + * @return source-side lettercase to target, and appends the alignment from to the source-side sentence in ||s */ private String maybeProjectCase(String hypothesis, DerivationState state) { String output = hypothesis; @@ -259,6 +265,9 @@ else if (annotation.equals("all-upper")) /** * Convenience function for k-best extraction that prints to STDOUT. + * @param hg the {@link org.apache.joshua.decoder.hypergraph.HyperGraph} from which to extract + * @param topN the number of k + * @throws IOException if there is an error writing the extraction */ public void lazyKBestExtractOnHG(HyperGraph hg, int topN) throws IOException { lazyKBestExtractOnHG(hg, topN, new BufferedWriter(new OutputStreamWriter(System.out))); @@ -278,7 +287,7 @@ public void lazyKBestExtractOnHG(HyperGraph hg, int topN) throws IOException { * @param hg the hypergraph to extract from * @param topN how many to extract * @param out object to write to - * @throws IOException + * @throws IOException if there is an error writing the extraction */ public void lazyKBestExtractOnHG(HyperGraph hg, int topN, BufferedWriter out) throws IOException { @@ -307,11 +316,13 @@ public void resetState() { } /** - * Returns the VirtualNode corresponding to an HGNode. If no such VirtualNode exists, it is - * created. + * Returns the {@link org.apache.joshua.decoder.hypergraph.KBestExtractor.VirtualNode} + * corresponding to an {@link org.apache.joshua.decoder.hypergraph.HGNode}. + * If no such VirtualNode exists, it is created. * - * @param hgnode - * @return the corresponding VirtualNode + * @param hgnode from which we wish to create a + * {@link org.apache.joshua.decoder.hypergraph.KBestExtractor.VirtualNode} + * @return the corresponding {@link org.apache.joshua.decoder.hypergraph.KBestExtractor.VirtualNode} */ private VirtualNode getVirtualNode(HGNode hgnode) { VirtualNode virtualNode = virtualNodesTable.get(hgnode); @@ -329,7 +340,6 @@ private VirtualNode getVirtualNode(HGNode hgnode) { * k-best derivations from that point on, retaining the derivations computed so far and a priority * queue of candidates. */ - private class VirtualNode { // The node being annotated. @@ -356,7 +366,7 @@ public VirtualNode(HGNode it) { /** * This returns a DerivationState corresponding to the kth-best derivation rooted at this node. * - * @param kbestExtractor + * @param kbestExtractor todo * @param k (indexed from one) * @return the k-th best (1-indexed) hypothesis, or null if there are no more. 
*/ @@ -650,7 +660,7 @@ public DerivationState(HGNode pa, HyperEdge e, int[] r, float c, int pos) { * assumption that the total number of words in the hypothesis scales linearly with the input * sentence span. * - * @return + * @return float representing {@link org.apache.joshua.decoder.BLEU} score */ public float computeBLEU() { if (stats == null) { @@ -677,7 +687,7 @@ public void setCost(float cost2) { * Returns the model cost. This is obtained by subtracting off the incorporated BLEU score (if * used). * - * @return + * @return float representing model cost */ public float getModelCost() { return this.cost; @@ -686,7 +696,7 @@ public float getModelCost() { /** * Returns the model cost plus the BLEU score. * - * @return + * @return float representing model cost plus the BLEU score */ public float getCost() { return cost - weights.getSparse("BLEU") * bleu; @@ -724,6 +734,7 @@ public boolean equals(Object other) { /** * DerivationState objects are unique to each VirtualNode, so the unique identifying information * only need contain the edge position and the ranks. + * @return hash of the edge position and ranks */ public int hashCode() { int hash = edgePos; @@ -737,6 +748,8 @@ public int hashCode() { /** * Visits every state in the derivation in a depth-first order. + * @param visitor todo + * @return todo */ private DerivationVisitor visit(DerivationVisitor visitor) { return visit(visitor, 0, 0); @@ -807,9 +820,9 @@ private String getDerivation() { * function looks up the VirtualNode corresponding to the HGNode pointed to by the edge's * {tailNodeIndex}th tail node. * - * @param edge - * @param tailNodeIndex - * @return + * @param edge todo + * @param tailNodeIndex todo + * @return todo */ public DerivationState getChildDerivationState(HyperEdge edge, int tailNodeIndex) { HGNode child = edge.getTailNodes().get(tailNodeIndex); @@ -839,7 +852,7 @@ public int compare(DerivationState one, DerivationState another) { * way to do different things to the tree (e.g., extract its words, assemble a derivation, and so * on) without having to rewrite the node-visiting code. * - * @author Matt Post <post@cs.jhu.edu> + * @author Matt Post post@cs.jhu.edu */ public interface DerivationVisitor { /** @@ -952,7 +965,7 @@ private void merge(Tree fragment) { * Assembles an informative version of the derivation. Each rule is printed as it is encountered. * Don't try to parse this output; make something that writes out JSON or something, instead. * - * @author Matt Post <post@cs.jhu.edu> + * @author Zhifei Li, zhifei.work@gmail.com * @version $LastChangedDate$ */ diff --git a/src/joshua/decoder/hypergraph/ViterbiExtractor.java b/src/main/java/org/apache/joshua/decoder/hypergraph/ViterbiExtractor.java similarity index 79% rename from src/joshua/decoder/hypergraph/ViterbiExtractor.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/ViterbiExtractor.java index 31c8dc0c..734e0aaf 100644 --- a/src/joshua/decoder/hypergraph/ViterbiExtractor.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/ViterbiExtractor.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import static java.util.Collections.emptyList; import java.util.ArrayList; import java.util.List; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.segment_file.Sentence; /** - * @author Zhifei Li, - * @author Matt Post + * @author Zhifei Li, zhifei.work@gmail.com + * @author Matt Post post@cs.jhu.edu */ public class ViterbiExtractor { @@ -61,25 +61,33 @@ public static void viterbiWalk( } } } - + public static void viterbiWalk(final HGNode node, final WalkerFunction walker) { viterbiWalk(node, walker, 0); } - + /** * Returns the Viterbi translation of the Hypergraph (includes sentence markers) + * @param hg a {@link org.apache.joshua.decoder.hypergraph.HyperGraph} we wish to + * obtain a Viterbi translation for + * @return a String Viterbi translation */ public static String getViterbiString(final HyperGraph hg) { if (hg == null) return ""; - + final WalkerFunction viterbiOutputStringWalker = new OutputStringExtractor(false); viterbiWalk(hg.goalNode, viterbiOutputStringWalker); return viterbiOutputStringWalker.toString(); } - + /** * Returns the Viterbi feature vector + * @param hg a {@link org.apache.joshua.decoder.hypergraph.HyperGraph} we wish to + * obtain a Viterbi features for + * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param sentence {@link org.apache.joshua.lattice.Lattice} input + * @return a Viterbi {@link org.apache.joshua.decoder.ff.FeatureVector} */ public static FeatureVector getViterbiFeatures( final HyperGraph hg, @@ -87,38 +95,46 @@ public static FeatureVector getViterbiFeatures( final Sentence sentence) { if (hg == null) return new FeatureVector(); - + final FeatureVectorExtractor extractor = new FeatureVectorExtractor( featureFunctions, sentence); - viterbiWalk(hg.goalNode, extractor); - return extractor.getFeatures(); + viterbiWalk(hg.goalNode, extractor); + return extractor.getFeatures(); } - + /** * Returns the Viterbi Word Alignments as String. + * @param hg input {@link org.apache.joshua.decoder.hypergraph.HyperGraph} + * @return the Viterbi Word Alignments as String */ public static String getViterbiWordAlignments(final HyperGraph hg) { if (hg == null) return ""; - + final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor(); viterbiWalk(hg.goalNode, wordAlignmentWalker); return wordAlignmentWalker.toString(); } - + /** * Returns the Viterbi Word Alignments as list of lists (target-side). 
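The extractors in this file all ride on viterbiWalk(): visit a node, then descend into the tail nodes of its best incoming hyperedge. A self-contained sketch of that recursion, under assumed field names (not the actual HGNode/HyperEdge API):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.function.Consumer;

    final class ViterbiWalkSketch {
      static class Node { Edge bestIncoming; }                    // assumed: cached best hyperedge
      static class Edge { List<Node> tails = new ArrayList<>(); } // assumed: antecedent nodes

      static void viterbiWalk(Node node, Consumer<Node> visit) {
        visit.accept(node);
        if (node.bestIncoming != null)
          for (Node tail : node.bestIncoming.tails)
            viterbiWalk(tail, visit);
      }
    }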
+ * @param hg input {@link org.apache.joshua.decoder.hypergraph.HyperGraph} + * @return a {@link java.util.List} of Viterbi Word Alignments */ public static List<List<Integer>> getViterbiWordAlignmentList(final HyperGraph hg) { if (hg == null) return emptyList(); - + final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor(); viterbiWalk(hg.goalNode, wordAlignmentWalker); return wordAlignmentWalker.getFinalWordAlignments(); } - - /** find 1best hypergraph */ + + /** + * find 1best hypergraph + * @param hg_in input {@link org.apache.joshua.decoder.hypergraph.HyperGraph} + * @return new best {@link org.apache.joshua.decoder.hypergraph.HyperGraph} + */ public static HyperGraph getViterbiTreeHG(HyperGraph hg_in) { HyperGraph res = new HyperGraph(cloneNodeWithBestHyperedge(hg_in.goalNode), -1, -1, null); @@ -152,7 +168,7 @@ private static HyperEdge cloneHyperedge(HyperEdge inEdge) { List<HGNode> antNodes = null; if (null != inEdge.getTailNodes()) { antNodes = new ArrayList<HGNode>(inEdge.getTailNodes());// l_ant_items will be changed in - // get_1best_tree_item + // get_1best_tree_item } HyperEdge res = new HyperEdge(inEdge.getRule(), inEdge.getBestDerivationScore(), inEdge.getTransitionLogP(false), diff --git a/src/joshua/decoder/hypergraph/WalkerFunction.java b/src/main/java/org/apache/joshua/decoder/hypergraph/WalkerFunction.java similarity index 83% rename from src/joshua/decoder/hypergraph/WalkerFunction.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/WalkerFunction.java index 65bffbf9..811521cf 100644 --- a/src/joshua/decoder/hypergraph/WalkerFunction.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/WalkerFunction.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; /** * Classes implementing this interface define a single function that is applied to each node. This @@ -28,6 +28,9 @@ public interface WalkerFunction { * Function that is applied to node at tail node index nodeIndex. * nodeIndex indicates the index of node in the list of tailnodes for the * outgoing edge. + * @param node the {@link org.apache.joshua.decoder.hypergraph.HGNode} we + * wish to apply some Walker Function to. + * @param nodeIndex node in the list of tailnodes for the outgoing edge */ void apply(HGNode node, int nodeIndex); diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentExtractor.java similarity index 93% rename from src/joshua/decoder/hypergraph/WordAlignmentExtractor.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentExtractor.java index 837c69fb..04d08979 100644 --- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentExtractor.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import static java.util.Collections.emptyList; import java.util.List; import java.util.Stack; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.KBestExtractor.DerivationState; -import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationState; +import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor; /** * This class enables extraction of word-level alignments from hypotheses. @@ -96,6 +96,7 @@ public void after(final DerivationState state, final int level, int tailNodeInde /** * Final word alignment without sentence markers * or empty list if stack is empty. + * @return a final alignment list */ public List> getFinalWordAlignments() { if (stack.isEmpty()) { diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentState.java similarity index 97% rename from src/joshua/decoder/hypergraph/WordAlignmentState.java rename to src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentState.java index 258e0629..51406520 100644 --- a/src/joshua/decoder/hypergraph/WordAlignmentState.java +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/WordAlignmentState.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.hypergraph; +package org.apache.joshua.decoder.hypergraph; import java.util.ArrayList; import java.util.LinkedList; @@ -24,7 +24,7 @@ import java.util.ListIterator; import java.util.Map; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.Rule; /** * This class encodes a derivation state in terms of a list of alignment points. @@ -83,6 +83,7 @@ public class WordAlignmentState { /** * if there are no more NonTerminals to substitute, * this state is said to be complete + * @return true if complete */ public boolean isComplete() { return numNT == 0; @@ -91,6 +92,7 @@ public boolean isComplete() { /** * builds the final alignment string in the standard alignment format: src - * trg. Sorted by trg indexes. Disregards the sentence markers. + * @return result string */ public String toFinalString() { StringBuilder sb = new StringBuilder(); @@ -111,6 +113,7 @@ public String toFinalString() { * builds the final alignment list. * each entry in the list corresponds to a list of aligned source tokens. * First and last item in trgPoints is skipped. + * @return a final alignment list */ public List> toFinalList() { assert (isComplete() == true); diff --git a/src/main/java/org/apache/joshua/decoder/hypergraph/package-info.java b/src/main/java/org/apache/joshua/decoder/hypergraph/package-info.java new file mode 100644 index 00000000..05e66e2e --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/hypergraph/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * Provides implementations of hypergraph data structures + * and related algorithms used in extracting translation + * results in hierarchical phrase-based translation. + */ +package org.apache.joshua.decoder.hypergraph; \ No newline at end of file diff --git a/src/joshua/decoder/io/DeNormalize.java b/src/main/java/org/apache/joshua/decoder/io/DeNormalize.java similarity index 96% rename from src/joshua/decoder/io/DeNormalize.java rename to src/main/java/org/apache/joshua/decoder/io/DeNormalize.java index 328e01b3..cc6e8393 100644 --- a/src/joshua/decoder/io/DeNormalize.java +++ b/src/main/java/org/apache/joshua/decoder/io/DeNormalize.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.io; +package org.apache.joshua.decoder.io; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -26,18 +26,17 @@ *
 * • Capitalize the first character in the string
 * • Detokenize
- *
 * • Delete whitespace in front of periods and commas
 * • Join contractions
 * • Capitalize name titles (Mr Ms Miss Dr etc.)
- * • TODO: Handle surrounding characters ([{<"''">}])
+ * • TODO: Handle surrounding characters ([{<"''">}])
 * • TODO: Join multi-period abbreviations (e.g. M.Phil. i.e.)
 * • TODO: Handle ambiguities like "st.", which can be an abbreviation for both "Saint" and "street"
 * • TODO: Capitalize both the title and the name of a person, e.g. Mr. Morton (named entities should be demarcated).
- * N.B. These methods all assume that every translation result that will be
+ * N.B. These methods all assume that every translation result that will be
 * denormalized has the following format:
 * • There is only one space between every pair of tokens
@@ -45,7 +44,6 @@
 * • There is no whitespace after the final token
 * • Standard spaces are the only type of whitespace
- *
 */ public class DeNormalize { @@ -53,8 +51,8 @@ /** * Apply all the denormalization methods to the normalized input line. * - * @param normalized - * @return + * @param normalized a normalized input line + * @return the denormalized String */ public static String processSingleLine(String normalized) { // The order in which the methods are applied could matter in some situations. E.g., a token to diff --git a/src/joshua/decoder/io/JSONMessage.java b/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java similarity index 97% rename from src/joshua/decoder/io/JSONMessage.java rename to src/main/java/org/apache/joshua/decoder/io/JSONMessage.java index 2733db41..50d9ef48 100644 --- a/src/joshua/decoder/io/JSONMessage.java +++ b/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.io; +package org.apache.joshua.decoder.io; import java.util.ArrayList; import java.util.List; @@ -24,7 +24,7 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import joshua.decoder.Translation; +import org.apache.joshua.decoder.Translation; public class JSONMessage { public Data data = null; diff --git a/src/joshua/decoder/io/TranslationRequestStream.java b/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java similarity index 94% rename from src/joshua/decoder/io/TranslationRequestStream.java rename to src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java index 47f5d815..432f1fbc 100644 --- a/src/joshua/decoder/io/TranslationRequestStream.java +++ b/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.io; +package org.apache.joshua.decoder.io; import java.io.BufferedReader; import java.io.IOException; @@ -24,17 +24,17 @@ import com.google.gson.stream.JsonReader; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.JoshuaConfiguration.INPUT_TYPE; -import joshua.decoder.MetaDataException; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration.INPUT_TYPE; +import org.apache.joshua.decoder.MetaDataException; +import org.apache.joshua.decoder.segment_file.Sentence; /** * This class iterates over an input stream, looking for inputs to translate. By default, it * expects plain-text input, which can be plain sentences or PLF-encoded lattices. If * '-input-type json' is passed to the decoder, it will instead read JSON objects from the input * stream, with the following format: - *
 * {
 *   "data": {
 *     "translations": [
@@ -44,8 +44,8 @@
 *     ]
 *   }
 * }
- *
- * @author Matt Post <post@cs.jhu.edu>
+ *
    + * @author Matt Post post@cs.jhu.edu * @author orluke */ public class TranslationRequestStream { diff --git a/src/main/java/org/apache/joshua/decoder/package-info.java b/src/main/java/org/apache/joshua/decoder/package-info.java new file mode 100644 index 00000000..af1127b2 --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/package-info.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides infrastructure and wrapper code used relevant to + * hierarchical phrase-based decoding for statistical machine + * translation. This package does not include an implementation + * of any actual decoding algorithm. Rather, such code is in + * child packages of this package. + */ +package org.apache.joshua.decoder; \ No newline at end of file diff --git a/src/joshua/decoder/phrase/Candidate.java b/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java similarity index 91% rename from src/joshua/decoder/phrase/Candidate.java rename to src/main/java/org/apache/joshua/decoder/phrase/Candidate.java index 4b8b6a62..ee8a2a9a 100644 --- a/src/joshua/decoder/phrase/Candidate.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; /*** * A candidate is basically a cube prune state. It contains a list of hypotheses and target @@ -28,11 +28,11 @@ import java.util.Arrays; import java.util.List; -import joshua.corpus.Span; -import joshua.decoder.chart_parser.ComputeNodeResult; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.corpus.Span; +import org.apache.joshua.decoder.chart_parser.ComputeNodeResult; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.hypergraph.HGNode; public class Candidate { @@ -170,7 +170,7 @@ public Hypothesis getHypothesis() { } /** - * This returns the target side {@link Phrase}, which is a {@link Rule} object. This is just a + * This returns the target side {@link org.apache.joshua.corpus.Phrase}, which is a {@link org.apache.joshua.decoder.ff.tm.Rule} object. This is just a * convenience function that works by returning the phrase indexed in ranks[1]. * * @return the phrase at position ranks[1] @@ -194,7 +194,7 @@ public List getTailNodes() { /** * Returns the bit vector of this hypothesis. The bit vector is computed by ORing the coverage * vector of the tail node (hypothesis) and the source span of phrases in this candidate. 
- * @return + * @return the bit vector of this hypothesis */ public Coverage getCoverage() { Coverage cov = new Coverage(getHypothesis().getCoverage()); @@ -203,9 +203,9 @@ public Coverage getCoverage() { } /** - * Sets the result of a candidate (should just be moved to the constructor). + * Sets the result of a candidate (TODO should just be moved to the constructor). * - * @param result + * @param result todo */ public void setResult(ComputeNodeResult result) { this.result = result; @@ -221,7 +221,7 @@ public void setResult(ComputeNodeResult result) { * The Future Cost item should probably just be implemented as another kind of feature function, * but it would require some reworking of that interface, which isn't worth it. * - * @return + * @return the sum of two costs: the HypoState cost + the transition cost */ public float score() { return getHypothesis().getScore() + future_delta + result.getTransitionCost(); diff --git a/src/joshua/decoder/phrase/CandidateComparator.java b/src/main/java/org/apache/joshua/decoder/phrase/CandidateComparator.java similarity index 96% rename from src/joshua/decoder/phrase/CandidateComparator.java rename to src/main/java/org/apache/joshua/decoder/phrase/CandidateComparator.java index 2526ed66..322f47a4 100644 --- a/src/joshua/decoder/phrase/CandidateComparator.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/CandidateComparator.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.Comparator; diff --git a/src/joshua/decoder/phrase/Coverage.java b/src/main/java/org/apache/joshua/decoder/phrase/Coverage.java similarity index 88% rename from src/joshua/decoder/phrase/Coverage.java rename to src/main/java/org/apache/joshua/decoder/phrase/Coverage.java index 398c7a04..2c674fcf 100644 --- a/src/joshua/decoder/phrase/Coverage.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Coverage.java @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.BitSet; -import joshua.corpus.Span; +import org.apache.joshua.corpus.Span; /** * Represents a coverage vector. The vector is relative to a hypothesis. {firstZero} denotes the @@ -29,7 +29,7 @@ */ public class Coverage { - + // The index of the first uncovered word private int firstZero; @@ -45,12 +45,12 @@ public Coverage() { firstZero = 0; bits = new BitSet(INITIAL_LENGTH); } - + public Coverage(int firstZero) { this.firstZero = firstZero; bits = new BitSet(INITIAL_LENGTH); } - + /** * Pretty-prints the coverage vector, making a guess about the length */ @@ -69,8 +69,7 @@ public String toString() { /** * Initialize a coverage vector from another Coverage vector, creating a separate object. * - * @param firstZero - * @param bits + * @param other an existing coverage vector from which to create a new coverage vector */ public Coverage(Coverage other) { this.firstZero = other.firstZero; @@ -81,14 +80,14 @@ public Coverage(Coverage other) { * Turns on all bits from position start to position (end - 1), that is, in the range [start .. end). * This is done relative to the current coverage vector, of course, which may not start at 0. 
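A toy version of the relative coverage vector described in these javadocs (assumed semantics, for illustration only): bits are stored relative to firstZero, and the window slides forward whenever the leading words become covered.

    import java.util.BitSet;

    final class CoverageSketch {
      int firstZero = 0;          // index of the first uncovered source word
      BitSet bits = new BitSet(); // coverage of words at positions >= firstZero

      // Mark words [begin, end) covered, then slide past the covered prefix.
      void set(int begin, int end) {
        bits.set(begin - firstZero, end - firstZero);
        while (bits.get(0)) {
          bits = bits.get(1, bits.length()); // drop the leading covered bit
          firstZero++;
        }
      }
    }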
* - * @param begin - * @param end + * @param begin bits at start position + * @param end bits at end position (end - 1) */ public void set(int begin, int end) { assert compatible(begin, end); -// StringBuffer sb = new StringBuffer(); -// sb.append(String.format("SET(%d,%d) %s", begin, end, this)); + // StringBuffer sb = new StringBuffer(); + // sb.append(String.format("SET(%d,%d) %s", begin, end, this)); if (begin == firstZero) { // A concatenation. @@ -106,12 +105,13 @@ public void set(int begin, int end) { bits.or(pattern(begin, end)); } -// sb.append(String.format(" -> %s", this)); -// System.err.println(sb); + // sb.append(String.format(" -> %s", this)); + // System.err.println(sb); } - + /** * Convenience function. + * @param span todo */ public final void set(Span span) { set(span.start, span.end); @@ -134,7 +134,7 @@ public boolean compatible(int begin, int end) { } return false; } - + /** * Returns the source sentence index of the first uncovered word. * @@ -155,7 +155,7 @@ public int firstZero() { * Find the left bound of the gap in which the phrase [begin, ...) sits. * * @param begin the start index of the phrase being applied. - * @return + * @return todo */ public int leftOpening(int begin) { for (int i = begin - firstZero; i > 0; --i) { @@ -173,12 +173,16 @@ public int leftOpening(int begin) { /** * LeftOpen() and RightOpen() find the larger gap in which a new source phrase pair sits. * When using a phrase pair covering (begin, end), the pair - * + *
        *     (LeftOpen(begin), RightOpen(end, sentence_length))  
    +   * 
    * * provides this gap. * * Finds the right bound of the enclosing gap, or the end of sentence, whichever is less. + * @param end end of phrase pair + * @param sentenceLength length of sentence + * @return todo */ public int rightOpening(int end, int sentenceLength) { for (int i = end - firstZero; i < Math.min(64, sentenceLength - firstZero); i++) { @@ -188,7 +192,7 @@ public int rightOpening(int end, int sentenceLength) { } return sentenceLength; } - + /** * Creates a bit vector with the same offset as the current coverage vector, flipping on * bits begin..end. @@ -198,7 +202,7 @@ public int rightOpening(int end, int sentenceLength) { * @return a bit vector (relative) with positions [begin..end) on */ public BitSet pattern(int begin, int end) { -// System.err.println(String.format("pattern(%d,%d) %d %s %s", begin, end, firstZero, begin >= firstZero, toString())); + // System.err.println(String.format("pattern(%d,%d) %d %s %s", begin, end, firstZero, begin >= firstZero, toString())); assert begin >= firstZero; BitSet pattern = new BitSet(INITIAL_LENGTH); pattern.set(begin - firstZero, end - firstZero); @@ -208,12 +212,12 @@ public BitSet pattern(int begin, int end) { /** * Returns the underlying coverage bits. * - * @return + * @return {@link java.util.BitSet} vector of bits */ public BitSet getCoverage() { return bits; } - + @Override public boolean equals(Object obj) { if (obj instanceof Coverage) { diff --git a/src/joshua/decoder/phrase/Future.java b/src/main/java/org/apache/joshua/decoder/phrase/Future.java similarity index 70% rename from src/joshua/decoder/phrase/Future.java rename to src/main/java/org/apache/joshua/decoder/phrase/Future.java index 22a02255..0ece4a32 100644 --- a/src/joshua/decoder/phrase/Future.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Future.java @@ -16,32 +16,28 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; -/*** - * This class represents the future cost of a hypothesis. The future cost of a hypothesis is the - * cost of covering all uncovered words. The way this is computed is with a simple dynamic program - * that computes, for each span of the input, the best possible way to cover that span with - * phrases from the phrase table. No non-local features (e.g., the language model cost) are used - * in computing this estimate. - */ - -import joshua.decoder.Decoder; -import joshua.util.ChartSpan; +import org.apache.joshua.util.ChartSpan; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class Future { - + + private static final Logger LOG = LoggerFactory.getLogger(Future.class); + // Square matrix with half the values ignored. private ChartSpan entries; private int sentlen; - + /** * Computes bottom-up the best way to cover all spans of the input sentence, using the phrases - * that have been assembled in a {@link PhraseChart}. Requires that there be a translation at least - * for every word (which can be accomplished with a pass-through grammar). + * that have been assembled in a {@link org.apache.joshua.decoder.phrase.PhraseChart}. + * Requires that there be a translation at least for every word (which can be + * accomplished with a pass-through grammar). 
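The comment block deleted above describes Future's estimate; in sketch form the dynamic program looks roughly like this (a simplification: plain arrays instead of ChartSpan, and abstract higher-is-better scores):

    final class FutureCostSketch {
      // best[i][j]: best score for covering span (i, j); bestPhrase is an
      // (n+1) x (n+1) table seeded with the best single-phrase score per span
      // (negative infinity where no phrase applies).
      static float[][] compute(float[][] bestPhrase, int n) {
        float[][] best = new float[n + 1][n + 1];
        for (int len = 1; len <= n; len++) {
          for (int i = 0; i + len <= n; i++) {
            int j = i + len;
            best[i][j] = bestPhrase[i][j];          // cover (i, j) with one phrase
            for (int k = i + 1; k < j; k++)         // or join two adjacent spans
              best[i][j] = Math.max(best[i][j], best[i][k] + best[k][j]);
          }
        }
        return best;
      }

      // Change in rest cost when (begin, end) inside the gap (left, right) is covered.
      static float change(float[][] best, int left, int begin, int end, int right) {
        return best[left][begin] + best[end][right] - best[left][right];
      }
    }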
* - * @param chart + * @param chart an input {@link org.apache.joshua.decoder.phrase.PhraseChart} */ public Future(PhraseChart chart) { @@ -58,7 +54,7 @@ public Future(PhraseChart chart) { // Insert phrases int max_end = Math.min(begin + chart.MaxSourcePhraseLength(), chart.SentenceLength()); for (int end = begin + 1; end <= max_end; end++) { - + // Moses doesn't include the cost of applying </s>, so force it to zero if (begin == sentlen - 1 && end == sentlen) setEntry(begin, end, 0.0f); @@ -69,7 +65,7 @@ } } } - + // All the phrases are in, now do minimum dynamic programming. Lengths 0 and 1 were already handled above. for (int length = 2; length <= chart.SentenceLength(); length++) { for (int begin = 1; begin <= chart.SentenceLength() - length; begin++) { @@ -78,40 +74,46 @@ } } } - - if (Decoder.VERBOSE >= 3) { - for (int i = 1; i < chart.SentenceLength(); i++) - for (int j = i + 1; j < chart.SentenceLength(); j++) - System.err.println(String.format("future cost from %d to %d is %.3f", i-1, j-2, getEntry(i, j))); + + if (LOG.isDebugEnabled()) { + for (int i = 1; i < chart.SentenceLength(); i++) { + for (int j = i + 1; j < chart.SentenceLength(); j++) { + LOG.debug("future cost from {} to {} is {}", i - 1, j - 2, getEntry(i, j)); + } + } } } - + public float Full() { -// System.err.println("Future::Full(): " + Entry(1, sentlen)); + // System.err.println("Future::Full(): " + Entry(1, sentlen)); return getEntry(1, sentlen); } /** * Calculate change in rest cost when the given coverage is to be covered. - */ + * @param coverage input {@link org.apache.joshua.decoder.phrase.Coverage} vector + * @param begin word at which to begin within a sentence + * @param end word at which to end within a sentence + * @return a float value representing a {@link Future} entry + */ public float Change(Coverage coverage, int begin, int end) { int left = coverage.leftOpening(begin); int right = coverage.rightOpening(end, sentlen); -// System.err.println(String.format("Future::Change(%s, %d, %d) left %d right %d %.3f %.3f %.3f", coverage, begin, end, left, right, -// Entry(left, begin), Entry(end, right), Entry(left, right))); + // System.err.println(String.format("Future::Change(%s, %d, %d) left %d right %d %.3f %.3f %.3f", coverage, begin, end, left, right, + // Entry(left, begin), Entry(end, right), Entry(left, right))); return getEntry(left, begin) + getEntry(end, right) - getEntry(left, right); } - + private float getEntry(int begin, int end) { assert end >= begin; assert end < this.sentlen; return entries.get(begin, end); } - + private void setEntry(int begin, int end, float value) { assert end >= begin; assert end < this.sentlen; -// System.err.println(String.format("future cost from %d to %d is %.5f", begin, end, value)); + // System.err.println(String.format("future cost from %d to %d is %.5f", begin, end, value)); entries.set(begin, end, value); } } diff --git a/src/joshua/decoder/phrase/Header.java b/src/main/java/org/apache/joshua/decoder/phrase/Header.java similarity index 89% rename from src/joshua/decoder/phrase/Header.java rename to src/main/java/org/apache/joshua/decoder/phrase/Header.java index 2a8370d4..30d771c3 100644 --- a/src/joshua/decoder/phrase/Header.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Header.java @@ -16,13 +16,19 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; // PORT: done +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.Comparator; public class Header implements Comparable
<Header>, Comparator<Header>
    { + + private static final Logger LOG = LoggerFactory.getLogger(Header.class); + private float score; private int arity; private Note note; @@ -47,7 +53,7 @@ protected Header(int arity) { public boolean Valid() { // C++: return base_; - System.err.println("Header::Valid(): " + (note != null)); + LOG.debug("Header::Valid(): {}", (note != null)); return note != null; } diff --git a/src/joshua/decoder/phrase/Hypothesis.java b/src/main/java/org/apache/joshua/decoder/phrase/Hypothesis.java similarity index 88% rename from src/joshua/decoder/phrase/Hypothesis.java rename to src/main/java/org/apache/joshua/decoder/phrase/Hypothesis.java index 3d4bf51c..71d3df97 100644 --- a/src/joshua/decoder/phrase/Hypothesis.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Hypothesis.java @@ -16,25 +16,25 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.state_maintenance.DPState; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.format.HieroFormatReader; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.state_maintenance.DPState; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; /** - * Represents a hypothesis, a translation of some coverage of the input. Extends {@link HGNode}, - * through a bit of a hack. Whereas (i,j) represents the span of an {@link HGNode}, i here is not used, + * Represents a hypothesis, a translation of some coverage of the input. Extends {@link org.apache.joshua.decoder.hypergraph.HGNode}, + * through a bit of a hack. Whereas (i,j) represents the span of an {@link org.apache.joshua.decoder.hypergraph.HGNode}, i here is not used, * and j is overloaded to denote the span of the phrase being applied. The complete coverage vector * can be obtained by looking at the tail pointer and casting it. * * @author Kenneth Heafield - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class Hypothesis extends HGNode implements Comparable { @@ -86,7 +86,7 @@ public Rule getRule() { * HGNodes (designed for chart parsing) maintain a span (i,j). We overload j * here to record the index of the last translated source word. * - * @return + * @return the int 'j' which is overloaded to denote the span of the phrase being applied */ public int LastSourceIndex() { return j; diff --git a/src/joshua/decoder/phrase/Note.java b/src/main/java/org/apache/joshua/decoder/phrase/Note.java similarity index 96% rename from src/joshua/decoder/phrase/Note.java rename to src/main/java/org/apache/joshua/decoder/phrase/Note.java index 19e6f628..15b0057e 100644 --- a/src/joshua/decoder/phrase/Note.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Note.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; // PORT: done diff --git a/src/joshua/decoder/phrase/PhraseChart.java b/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java similarity index 77% rename from src/joshua/decoder/phrase/PhraseChart.java rename to src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java index a0179ff5..9803d9b0 100644 --- a/src/joshua/decoder/phrase/PhraseChart.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import joshua.decoder.Decoder; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class represents a bundle of phrase tables that have been read in, @@ -34,6 +36,8 @@ */ public class PhraseChart { + private static final Logger LOG = LoggerFactory.getLogger(PhraseChart.class); + private int sentence_length; private int max_source_phrase_length; @@ -49,8 +53,10 @@ public class PhraseChart { * applicable against the current input sentence. These phrases are extracted * from all available grammars. * - * @param tables - * @param source + * @param tables input array of {@link org.apache.joshua.decoder.phrase.PhraseTable}'s + * @param features {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param source input to {@link org.apache.joshua.lattice.Lattice} + * @param num_options number of translation options (typically set to 20) */ public PhraseChart(PhraseTable[] tables, List features, Sentence source, int num_options) { @@ -92,18 +98,18 @@ public PhraseChart(PhraseTable[] tables, List features, Sentenc phrases.finish(features, Decoder.weights, num_options); } - Decoder.LOG(1, String.format("Input %d: Collecting options took %.3f seconds", source.id(), - (System.currentTimeMillis() - startTime) / 1000.0f)); + LOG.info("Input {}: Collecting options took {} seconds", source.id(), + (System.currentTimeMillis() - startTime) / 1000.0f); - if (Decoder.VERBOSE(3)) { + if (LOG.isDebugEnabled()) { for (int i = 1; i < sentence_length - 1; i++) { for (int j = i + 1; j < sentence_length && j <= i + max_source_phrase_length; j++) { if (source.hasPath(i, j)) { TargetPhrases phrases = getRange(i, j); if (phrases != null) { - System.err.println(String.format("%s (%d-%d)", source.source(i,j), i, j)); + LOG.debug("{} ({}-{})", source.source(i,j), i, j); for (Rule rule: phrases) - System.err.println(String.format(" %s :: est=%.3f", rule.getEnglishWords(), rule.getEstimatedCost())); + LOG.debug(" {} :: est={}", rule.getEnglishWords(), rule.getEstimatedCost()); } } } @@ -123,8 +129,8 @@ public int MaxSourcePhraseLength() { /** * Maps two-dimensional span into a one-dimensional array. 
* - * @param i - * @param j + * @param i beginning of span + * @param j end of span * @return offset into private list of TargetPhrases */ private int offset(int i, int j) { @@ -134,9 +140,9 @@ private int offset(int i, int j) { /** * Returns phrases from all grammars that match the span. * - * @param begin - * @param end - * @return + * @param begin beginning of span + * @param end end of span + * @return the {@link org.apache.joshua.decoder.phrase.TargetPhrases} at the specified position in this list. */ public TargetPhrases getRange(int begin, int end) { int index = offset(begin, end); @@ -156,9 +162,9 @@ public TargetPhrases getRange(int begin, int end) { /** * Add a set of phrases from a grammar to the current span. * - * @param begin - * @param end - * @param to + * @param begin beginning of span + * @param end end of span + * @param to a {@link org.apache.joshua.decoder.ff.tm.RuleCollection} to be used in scoring and sorting. */ private void addToRange(int begin, int end, RuleCollection to) { if (to != null) { @@ -183,8 +189,8 @@ private void addToRange(int begin, int end, RuleCollection to) { else entries.get(offset).addAll(rules); } catch (java.lang.IndexOutOfBoundsException e) { - System.err.println(String.format("Whoops! %s [%d-%d] too long (%d)", to, begin, end, - entries.size())); + LOG.error("Whoops! {} [{}-{}] too long ({})", to, begin, end, entries.size()); + LOG.error(e.getMessage(), e); } } } diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java similarity index 82% rename from src/joshua/decoder/phrase/PhraseTable.java rename to src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java index bcf7135f..733e1e1f 100644 --- a/src/joshua/decoder/phrase/PhraseTable.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java @@ -16,21 +16,21 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.io.File; import java.io.IOException; import java.util.List; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.RuleCollection; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; -import joshua.decoder.ff.tm.packed.PackedGrammar; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.RuleCollection; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar; +import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar; /** * Represents a phrase table, and is implemented as a wrapper around either a {@link PackedGrammar} @@ -47,12 +47,13 @@ public class PhraseTable implements Grammar { * Chain to the super with a number of defaults. For example, we only use a single nonterminal, * and there is no span limit. 
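The offset() helper above flattens a two-dimensional span into a one-dimensional index; the actual formula is not visible in this hunk, but a standard layout keyed on start position and phrase length would look like this (purely illustrative):

    final class SpanIndexSketch {
      final int maxPhraseLength; // longest source phrase the chart stores

      SpanIndexSketch(int maxPhraseLength) { this.maxPhraseLength = maxPhraseLength; }

      // Row = span start, column = phrase length - 1, valid for 0 < j - i <= maxPhraseLength.
      int offset(int i, int j) {
        return i * maxPhraseLength + (j - i - 1);
      }
    }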
* - * @param grammarFile - * @param owner - * @param config - * @throws IOException + * @param grammarFile file path parent directory + * @param owner used to set phrase owners + * @param type the grammar specification keyword (e.g., "thrax" or "moses") + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} + * @throws IOException if there is an error reading the grammar file */ - public PhraseTable(String grammarFile, String owner, String type, JoshuaConfiguration config, int maxSource) + public PhraseTable(String grammarFile, String owner, String type, JoshuaConfiguration config) throws IOException { this.config = config; int spanLimit = 0; @@ -60,9 +61,9 @@ public PhraseTable(String grammarFile, String owner, String type, JoshuaConfigur if (grammarFile != null && new File(grammarFile).isDirectory()) { this.backend = new PackedGrammar(grammarFile, spanLimit, owner, type, config); if (this.backend.getMaxSourcePhraseLength() == -1) { - System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you"); - System.err.println(" packed the grammar with Joshua 6.0.2 or greater"); - System.exit(-1); + String msg = "FATAL: Using a packed grammar for a phrase table backend requires that you " + + "packed the grammar with Joshua 6.0.2 or greater"; + throw new RuntimeException(msg); } } else { @@ -81,7 +82,7 @@ public PhraseTable(String owner, JoshuaConfiguration config) { * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line. * - * @return + * @return the longest source phrase read. */ @Override public int getMaxSourcePhraseLength() { diff --git a/src/joshua/decoder/phrase/Stack.java b/src/main/java/org/apache/joshua/decoder/phrase/Stack.java similarity index 71% rename from src/joshua/decoder/phrase/Stack.java rename to src/main/java/org/apache/joshua/decoder/phrase/Stack.java index 88b529a4..2c2e15e3 100644 --- a/src/joshua/decoder/phrase/Stack.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Stack.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.ArrayList; import java.util.Collections; @@ -26,18 +26,22 @@ import java.util.PriorityQueue; import java.util.Set; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.ComputeNodeResult; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.ComputeNodeResult; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Organizes all hypotheses containing the same number of source words. * */ public class Stack extends ArrayList { - + + private static final Logger LOG = LoggerFactory.getLogger(Stack.class); + private static final long serialVersionUID = 7885252799032416068L; private HashMap> coverages; @@ -58,9 +62,9 @@ public class Stack extends ArrayList { /** * Create a new stack. Stacks are organized one for each number of source words that are covered. 
* - * @param featureFunctions - * @param sentence - * @param config + * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param sentence input for a {@link org.apache.joshua.lattice.Lattice} + * @param config populated {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public Stack(List featureFunctions, Sentence sentence, JoshuaConfiguration config) { this.featureFunctions = featureFunctions; @@ -76,6 +80,8 @@ public Stack(List featureFunctions, Sentence sentence, JoshuaCo /** * A Stack is an ArrayList; here, we intercept the add so we can maintain a list of the items * stored under each distinct coverage vector + * @param hyp a {@link org.apache.joshua.decoder.phrase.Hypothesis} to add to the {@link org.apache.joshua.decoder.phrase.Stack} + * @return true if the {@link org.apache.joshua.decoder.phrase.Hypothesis} is appended to the list */ @Override public boolean add(Hypothesis hyp) { @@ -106,6 +112,7 @@ public boolean remove(Object obj) { /** * Returns the set of coverages contained in this stack. This is used to iterate over them * in the main decoding loop in Stacks.java. + * @return a {@link java.util.Set} of {@link org.apache.joshua.decoder.phrase.Coverage}'s */ public Set getCoverages() { return coverages.keySet(); @@ -114,8 +121,8 @@ public Set getCoverages() { /** * Get all items with the same coverage vector. * - * @param cov - * @return + * @param cov the {@link org.apache.joshua.decoder.phrase.Coverage} vector to get + * @return an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.phrase.Hypothesis}' */ public ArrayList get(Coverage cov) { ArrayList list = coverages.get(cov); @@ -126,15 +133,14 @@ public ArrayList get(Coverage cov) { /** * Receives a partially-initialized translation candidate and places it on the * priority queue after scoring it with all of the feature functions. In this - * respect it is like {@link CubePruneState} (it could make use of that class with + * respect it is like {@link org.apache.joshua.decoder.chart_parser.CubePruneState} (it could make use of that class with * a little generalization of spans / coverage). * * This function is also used to (fairly concisely) implement constrained decoding. Before * adding a candidate, we ensure that the sequence of English words match the sentence. If not, * the code extends the dot in the cube-pruning chart to the next phrase, since that one might * be a match. - * - * @param cand + * @param cand a partially-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate} */ public void addCandidate(Candidate cand) { if (visitedStates.contains(cand)) @@ -168,17 +174,14 @@ public void addCandidate(Candidate cand) { * Cube pruning. Repeatedly pop the top candidate, creating a new hyperedge from it, adding it to * the k-best list, and then extending the list of candidates with extensions of the current * candidate. - * - * @param context - * @param output */ public void search() { int to_pop = config.pop_limit; - if (Decoder.VERBOSE >= 3) { - System.err.println("Stack::search(): pop: " + to_pop + " size: " + candidates.size()); + if (LOG.isDebugEnabled()) { + LOG.debug("Stack::search(): pop: {} size: {}", to_pop, candidates.size()); for (Candidate c: candidates) - System.err.println(" " + c); + LOG.debug("{}", c); } while (to_pop > 0 && !candidates.isEmpty()) { Candidate got = candidates.poll(); @@ -197,38 +200,31 @@ public void search() { /** * Adds a popped candidate to the chart / main stack. 
This is a candidate we have decided to * keep around. + * @param complete a completely-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate} * */ public void addHypothesis(Candidate complete) { Hypothesis added = new Hypothesis(complete); - + + String taskName; if (deduper.containsKey(added)) { + taskName = "recombining hypothesis"; Hypothesis existing = deduper.get(added); existing.absorb(added); - - if (Decoder.VERBOSE >= 3) { - System.err.println(String.format("recombining hypothesis from ( ... %s )", complete.getHypothesis().getRule().getEnglishWords())); - System.err.println(String.format(" base score %.3f", complete.getResult().getBaseCost())); - System.err.println(String.format(" covering %d-%d", complete.getSpan().start - 1, complete.getSpan().end - 2)); - System.err.println(String.format(" translated as: %s", complete.getRule().getEnglishWords())); - System.err.println(String.format(" score %.3f + future cost %.3f = %.3f", - complete.getResult().getTransitionCost(), complete.getFutureEstimate(), - complete.getResult().getTransitionCost() + complete.getFutureEstimate())); - } - } else { + taskName = "creating new hypothesis"; add(added); deduper.put(added, added); - - if (Decoder.VERBOSE >= 3) { - System.err.println(String.format("creating new hypothesis from ( ... %s )", complete.getHypothesis().getRule().getEnglishWords())); - System.err.println(String.format(" base score %.3f", complete.getResult().getBaseCost())); - System.err.println(String.format(" covering %d-%d", complete.getSpan().start - 1, complete.getSpan().end - 2)); - System.err.println(String.format(" translated as: %s", complete.getRule().getEnglishWords())); - System.err.println(String.format(" score %.3f + future cost %.3f = %.3f", - complete.getResult().getTransitionCost(), complete.getFutureEstimate(), - complete.getResult().getTransitionCost() + complete.getFutureEstimate())); - } + } + + if (LOG.isDebugEnabled()) { + LOG.debug("{} from ( ... {} )", taskName, complete.getHypothesis().getRule().getEnglishWords()); + LOG.debug(" base score {}", complete.getResult().getBaseCost()); + LOG.debug(" covering {}-{}", complete.getSpan().start - 1, complete.getSpan().end - 2); + LOG.debug(" translated as: {}", complete.getRule().getEnglishWords()); + LOG.debug(" score {} + future cost {} = {}", + complete.getResult().getTransitionCost(), complete.getFutureEstimate(), + complete.getResult().getTransitionCost() + complete.getFutureEstimate()); } } } diff --git a/src/joshua/decoder/phrase/Stacks.java b/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java similarity index 85% rename from src/joshua/decoder/phrase/Stacks.java rename to src/main/java/org/apache/joshua/decoder/phrase/Stacks.java index eda7d8b1..adb0b512 100644 --- a/src/joshua/decoder/phrase/Stacks.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; /*** * Entry point for phrase-based decoding, analogous to {@link Chart} for the CKY algorithm. This @@ -32,26 +32,30 @@ * ensures that the coverage vector is consistent but the resulting hypergraph may not be projective, * which is different from the CKY algorithm, which does produce projective derivations. * - * Lattice decoding is not yet supported (March 2015). + * TODO Lattice decoding is not yet supported (March 2015). 
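The class comment above compresses the whole algorithm; as a schematic sketch of the control flow of search() (names are illustrative, scoring, data structures, and pruning elided):

    class StackDecodingSketch {
      // Schematic of the stack-decoding loop in Stacks.search(); stack i
      // holds hypotheses covering exactly i source words.
      static void decode(int sentenceLength, int maxSourcePhraseLength) {
        for (int sourceWords = 1; sourceWords <= sentenceLength; sourceWords++) {
          int longest = Math.min(sourceWords, maxSourcePhraseLength);
          for (int phraseLength = 1; phraseLength <= longest; phraseLength++) {
            int fromStack = sourceWords - phraseLength;
            // extend each hypothesis in stack[fromStack] with every target
            // phrase spanning phraseLength untranslated source words
          }
          // then cube-prune the new stack, popping up to config.pop_limit candidates
        }
      }
    }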
*/ import java.util.ArrayList; import java.util.List; -import joshua.corpus.Span; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.chart_parser.ComputeNodeResult; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.tm.AbstractGrammar; -import joshua.decoder.ff.tm.Grammar; -import joshua.decoder.hypergraph.HGNode; -import joshua.decoder.hypergraph.HyperEdge; -import joshua.decoder.hypergraph.HyperGraph; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Span; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.chart_parser.ComputeNodeResult; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.tm.AbstractGrammar; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.decoder.hypergraph.HGNode; +import org.apache.joshua.decoder.hypergraph.HyperEdge; +import org.apache.joshua.decoder.hypergraph.HyperGraph; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class Stacks { + private static final Logger LOG = LoggerFactory.getLogger(Stacks.class); + // The list of stacks, grouped according to number of source words covered private List stacks; @@ -71,10 +75,10 @@ public class Stacks { * Entry point. Initialize everything. Create pass-through (OOV) phrase table and glue phrase * table (with start-of-sentence and end-of-sentence rules). * - * @param sentence - * @param featureFunctions - * @param grammars - * @param config + * @param sentence input to {@link org.apache.joshua.lattice.Lattice} + * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param grammars an array of {@link org.apache.joshua.decoder.ff.tm.Grammar}'s + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public Stacks(Sentence sentence, List featureFunctions, Grammar[] grammars, JoshuaConfiguration config) { @@ -106,7 +110,7 @@ public Stacks(Sentence sentence, List featureFunctions, Grammar /** * The main algorithm. Returns a hypergraph representing the search space. * - * @return + * @return a {@link org.apache.joshua.decoder.hypergraph.HyperGraph} representing the search space */ public HyperGraph search() { @@ -135,10 +139,9 @@ public HyperGraph search() { phrase_length++) { int from_stack = source_words - phrase_length; Stack tailStack = stacks.get(from_stack); - - if (Decoder.VERBOSE >= 3) - System.err.println(String.format("\n WORDS %d MAX %d (STACK %d phrase_length %d)", source_words, - chart.MaxSourcePhraseLength(), from_stack, phrase_length)); + + LOG.debug("WORDS {} MAX {} (STACK {} phrase_length {})", source_words, + chart.MaxSourcePhraseLength(), from_stack, phrase_length); // Iterate over antecedents in this stack. 
for (Coverage coverage: tailStack.getCoverages()) { @@ -167,8 +170,9 @@ public HyperGraph search() { if (phrases == null) continue; - if (Decoder.VERBOSE >= 3) - System.err.println(String.format(" Applying %d target phrases over [%d,%d]", phrases.size(), begin, begin + phrase_length)); + + LOG.debug("Applying {} target phrases over [{}, {}]", + phrases.size(), begin, begin + phrase_length); // TODO: could also compute some number of features here (e.g., non-LM ones) // float score_delta = context.GetScorer().transition(ant, phrases, begin, begin + phrase_length); @@ -199,8 +203,8 @@ public HyperGraph search() { targetStack.search(); } - Decoder.LOG(1, String.format("Input %d: Search took %.3f seconds", sentence.id(), - (System.currentTimeMillis() - startTime) / 1000.0f)); + LOG.info("Input {}: Search took {} seconds", sentence.id(), + (System.currentTimeMillis() - startTime) / 1000.0f); return createGoalNode(); } diff --git a/src/joshua/decoder/phrase/TargetPhrases.java b/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java similarity index 81% rename from src/joshua/decoder/phrase/TargetPhrases.java rename to src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java index 83b69d0e..05a4b0a9 100644 --- a/src/joshua/decoder/phrase/TargetPhrases.java +++ b/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import joshua.decoder.ff.FeatureFunction; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.FeatureFunction; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.tm.Rule; /** * Represents a sorted collection of target-side phrases. Typically, these are phrases @@ -45,7 +45,7 @@ public TargetPhrases() { /** * Initialize with a collection of rules. * - * @param list + * @param list a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.tm.Rule}'s */ public TargetPhrases(List list) { super(); @@ -59,6 +59,9 @@ public TargetPhrases(List list) { * Score the rules and sort them. Scoring is necessary because rules are only scored if they * are used, in an effort to make reading in rules more efficient. This is starting to create * some trouble and should probably be reworked. + * @param features a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s + * @param weights a populated {@link org.apache.joshua.decoder.ff.FeatureVector} + * @param num_options the number of options */ public void finish(List features, FeatureVector weights, int num_options) { for (Rule rule: this) { diff --git a/src/joshua/decoder/segment_file/ConstraintRule.java b/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java similarity index 77% rename from src/joshua/decoder/segment_file/ConstraintRule.java rename to src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java index 9968640f..5146e2c6 100644 --- a/src/joshua/decoder/segment_file/ConstraintRule.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java @@ -16,50 +16,52 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; import javax.swing.text.Segment; - /** * This interface is for an individual (partial) item to seed the chart with. All rules should be * flat (no hierarchical nonterminals). *

    * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces - * should not be used internally by the Chart. The objects returned by a + * should not be used internally by the Chart. The objects returned by a * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of * these objects into its own internal representation during construction. That is the contract * described by these interfaces. * - * @see Type + * @see org.apache.joshua.decoder.segment_file.ConstraintRule.Type * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public interface ConstraintRule { /** - * There are three types of ConstraintRule. The RULE type returns non-null values for all methods. + *

There are three types of ConstraintRule. The RULE type returns non-null values for all methods.
   * The LHS type provides a (non-null) value for the lhs method, but returns null for everything
   * else. And the RHS type provides a (non-null) value for nativeRhs and foreignRhs but returns
   * null for the lhs and features.</p>
   * <p>
   * The interpretation of a RULE is that it adds a new rule to the grammar which only applies to
   * the associated span. If the associated span is hard, then the set of rules for that span will
   * override the regular grammar.</p>
   * <p>
   * The interpretation of a LHS is that it provides a hard constraint that the associated span be
   * treated as the nonterminal for that span, thus filtering the regular grammar.</p>
   * <p>
   * The interpretation of a RHS is that it provides a hard constraint to filter the regular grammar
   * such that only rules generating the desired translation can be used.</p>

    */ public enum Type { RULE, LHS, RHS }; - /** Return the type of this ConstraintRule. */ + /** + * Return the type of this ConstraintRule. + * @return the {@link org.apache.joshua.decoder.segment_file.ConstraintRule.Type} + */ Type type(); @@ -67,6 +69,7 @@ public enum Type { * Return the left hand side of the constraint rule. If this is null, then this object is * specifying a translation for the span, but that translation may be derived from any * nonterminal. The nonterminal here must be one used by the regular grammar. + * @return the left hand side of the constraint rule */ String lhs(); @@ -74,6 +77,7 @@ public enum Type { /** * Return the native right hand side of the constraint rule. If this is null, then the regular * grammar will be used to fill in the derivation from the lhs. + * @return the native right hand side of the constraint rule */ String nativeRhs(); @@ -81,6 +85,7 @@ public enum Type { /** * Return the foreign right hand side of the constraint rule. This must be consistent with the * sentence for the associated span, and is provided as a convenience method. + * @return the foreign right hand side of the constraint rule */ String foreignRhs(); @@ -88,7 +93,8 @@ public enum Type { /** * Return the grammar feature values for the RULE. The length of this array must be the same as * for the regular grammar. We cannot enforce this requirement, but the - * {@link joshua.decoder.chart_parser.Chart} must throw an error if there is a mismatch. + * {@link org.apache.joshua.decoder.chart_parser.Chart} must throw an error if there is a mismatch. + * @return an array of floating feature values for the RULE */ float[] features(); } diff --git a/src/joshua/decoder/segment_file/ConstraintSpan.java b/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java similarity index 86% rename from src/joshua/decoder/segment_file/ConstraintSpan.java rename to src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java index c8087bd1..9863fa6a 100644 --- a/src/joshua/decoder/segment_file/ConstraintSpan.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; import java.util.List; @@ -38,29 +38,32 @@ *

    * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces - * should not be used internally by the Chart. The objects returned by a + * should not be used internally by the Chart. The objects returned by a * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of * these objects into its own internal representation during construction. That is the contract * described by these interfaces. * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net */ public interface ConstraintSpan { /** * Return the starting index of the span covered by this constraint. + * @return the starting index of the span covered by this constraint */ int start(); /** * Return the ending index of the span covered by this constraint. Clients may assume * this.end() >= 1 + this.start(). + * @return the ending index of the span covered by this constraint */ int end(); /** * Return whether this is a hard constraint which should override the grammar. This value only * really matters for sets of RULE type constraints. + * @return true if a hard constraint exists which should override the grammar */ boolean isHard(); @@ -71,6 +74,7 @@ public interface ConstraintSpan { * {@link java.util.Iterator} instead in order to reduce the coupling between this class and * Chart. See the note above about the fact that this interface should not be used internally by * the Chart class because it will not be performant. + * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.ConstraintRule}'s */ List rules(); } diff --git a/src/joshua/decoder/segment_file/ParseTreeInput.java b/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java similarity index 92% rename from src/joshua/decoder/segment_file/ParseTreeInput.java rename to src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java index 5feb0514..b9b1896c 100644 --- a/src/joshua/decoder/segment_file/ParseTreeInput.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; -import joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration; public class ParseTreeInput extends Sentence { diff --git a/src/joshua/decoder/segment_file/ParsedSentence.java b/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java similarity index 87% rename from src/joshua/decoder/segment_file/ParsedSentence.java rename to src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java index 9273b96b..a97718e3 100644 --- a/src/joshua/decoder/segment_file/ParsedSentence.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; -import joshua.corpus.Vocabulary; -import joshua.corpus.syntax.ArraySyntaxTree; -import joshua.corpus.syntax.SyntaxTree; -import joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.syntax.ArraySyntaxTree; +import org.apache.joshua.corpus.syntax.SyntaxTree; +import org.apache.joshua.decoder.JoshuaConfiguration; public class ParsedSentence extends Sentence { diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java similarity index 88% rename from src/joshua/decoder/segment_file/Sentence.java rename to src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java index 588850ba..785469d7 100644 --- a/src/joshua/decoder/segment_file/Sentence.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java @@ -16,10 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; -import static joshua.util.FormatUtils.addSentenceMarkers; -import static joshua.util.FormatUtils.escapeSpecialSymbols; +import static org.apache.joshua.util.FormatUtils.addSentenceMarkers; import java.util.ArrayList; import java.util.HashSet; @@ -30,27 +29,33 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.tm.Grammar; -import joshua.lattice.Arc; -import joshua.lattice.Lattice; -import joshua.lattice.Node; -import joshua.util.ChartSpan; -import joshua.util.Regex; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.tm.Grammar; +import org.apache.joshua.lattice.Arc; +import org.apache.joshua.lattice.Lattice; +import org.apache.joshua.lattice.Node; +import org.apache.joshua.util.ChartSpan; +import org.apache.joshua.util.Regex; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class represents lattice input. The lattice is contained on a single line and is represented * in PLF (Python Lattice Format), e.g., * + *

* <pre>
   * ((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)
   * </pre>
    * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class Sentence { + private static final Logger LOG = LoggerFactory.getLogger(Sentence.class); + /* The sentence number. */ public int id = -1; @@ -77,8 +82,9 @@ public class Sentence { * Constructor. Receives a string representing the input sentence. This string may be a * string-encoded lattice or a plain text string for decoding. * - * @param inputString - * @param id + * @param inputString representing the input sentence + * @param id ID to associate with the input string + * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public Sentence(String inputString, int id, JoshuaConfiguration joshuaConfiguration) { @@ -135,7 +141,7 @@ public boolean isLinearChain() { /** * Returns the length of the sentence. For lattices, the length is the shortest path through the - * lattice. The length includes the and sentence markers. + * lattice. The length includes the <s> and </s> sentence markers. * * @return number of input tokens + 2 (for start and end of sentence markers) */ @@ -255,20 +261,19 @@ public void segmentOOVs(Grammar[] grammars) { } /** - * If the input sentence is too long (not counting the and tokens), it is truncated to + * If the input sentence is too long (not counting the <s> and </s> tokens), it is truncated to * the maximum length, specified with the "maxlen" parameter. * * Note that this code assumes the underlying representation is a sentence, and not a lattice. Its * behavior is undefined for lattices. * - * @param length + * @param length int representing the length to truncate the sentence to */ protected void adjustForLength(int length) { int size = this.getLattice().size() - 2; // subtract off the start- and end-of-sentence tokens if (size > length) { - Decoder.LOG(1, String.format("* WARNING: sentence %d too long (%d), truncating to length %d", - id(), size, length)); + LOG.warn("sentence {} too long {}, truncating to length {}", id(), size, length); // Replace the input sentence (and target) -- use the raw string, not source() String[] tokens = source.split("\\s+"); @@ -292,6 +297,7 @@ public int id() { /** * Returns the raw source-side input string. + * @return the raw source-side input string */ public String rawSource() { return source; @@ -300,7 +306,7 @@ public String rawSource() { /** * Returns the source-side string with annotations --- if any --- stripped off. * - * @return + * @return the source-side string with annotations --- if any --- stripped off */ public String source() { StringBuilder str = new StringBuilder(); @@ -332,7 +338,7 @@ public String fullSource() { * * If the parameter parse=true is set, parsing will be triggered, otherwise constrained decoding. * - * @return + * @return target side of sentence translation */ public String target() { return target; @@ -368,7 +374,7 @@ public String[] references() { * Returns the sequence of tokens comprising the sentence. This assumes you've done the checking * to makes sure the input string (the source side) isn't a PLF waiting to be parsed. * - * @return + * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.Token}'s comprising the sentence */ public List getTokens() { assert isLinearChain(); @@ -382,6 +388,7 @@ public List getTokens() { /** * Returns the sequence of word IDs comprising the input sentence. Assumes this is not a general * lattice, but a linear chain. 
+ * @return an int[] comprising all word ID's */ public int[] getWordIDs() { List tokens = getTokens(); @@ -395,7 +402,7 @@ public int[] getWordIDs() { * Returns the sequence of word ids comprising the sentence. Assumes this is a sentence and * not a lattice. * - * @return + * @return the sequence of word ids comprising the sentence */ public Lattice stringLattice() { assert isLinearChain(); @@ -410,8 +417,7 @@ public Lattice getLattice() { if (this.sourceLattice == null) { if (config.lattice_decoding && rawSource().startsWith("(((")) { if (config.search_algorithm.equals("stack")) { - System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm."); - System.exit(12); + throw new RuntimeException("* FATAL: lattice decoding currently not supported for stack-based search algorithm."); } this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config); } else diff --git a/src/joshua/decoder/segment_file/Token.java b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java similarity index 76% rename from src/joshua/decoder/segment_file/Token.java rename to src/main/java/org/apache/joshua/decoder/segment_file/Token.java index bddfd68e..3aa02ef2 100644 --- a/src/joshua/decoder/segment_file/Token.java +++ b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java @@ -16,18 +16,20 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.decoder.segment_file; +package org.apache.joshua.decoder.segment_file; -import static joshua.util.FormatUtils.escapeSpecialSymbols; +import static org.apache.joshua.util.FormatUtils.escapeSpecialSymbols; import java.util.HashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.util.FormatUtils; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.util.FormatUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Stores the identity of a word and its annotations in a sentence. @@ -36,6 +38,9 @@ * @author Matt Post */ public class Token { + + private static final Logger LOG = LoggerFactory.getLogger(Token.class); + // The token without the annotations private String token; private int tokenID; @@ -44,24 +49,30 @@ public class Token { private JoshuaConfiguration joshuaConfiguration; /** - * Constructor : Creates a Token object from a raw word + *

Constructor: Creates a Token object from a raw word.
   * Extracts and assigns an annotation when available.
   * Any word can be marked with annotations, which are arbitrary semicolon-delimited
   * key[=value] pairs (the value is optional) listed in brackets after a word, e.g.,</p>
   *
   * <pre>
   *    Je[ref=Samuel;PRO] voudrais[FUT;COND]
   * </pre>
   *
   * <p>This will create a dictionary annotation on the word of the following form for "Je":</p>
   *
   * <pre>
   *   ref -> Samuel
   *   PRO -> PRO
   * </pre>
   *
   * <p>and the following for "voudrais":</p>
   *
   * <pre>
   *   FUT  -> FUT
   *   COND -> COND
   * </pre>
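A hypothetical use of the annotation syntax just described, against the Token constructor and the getAnnotation() accessor shown further down in this hunk; the bare JoshuaConfiguration is a placeholder:

    import org.apache.joshua.decoder.JoshuaConfiguration;
    import org.apache.joshua.decoder.segment_file.Token;

    class TokenDemo {
      public static void main(String[] args) {
        Token t = new Token("voudrais[FUT;COND]", new JoshuaConfiguration());
        // valueless annotation keys map to themselves, per the javadoc above
        System.out.println(t.getAnnotation("FUT"));  // FUT
        System.out.println(t.getAnnotation("COND")); // COND
      }
    }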
    * * @param rawWord A word with annotation information (possibly) + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} * */ public Token(String rawWord, JoshuaConfiguration config) { @@ -104,7 +115,7 @@ else if (Character.isUpperCase(token.charAt(0))) else annotations.put("lettercase", "lower"); - Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase"))); + LOG.info("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase")); token = token.toLowerCase(); } @@ -135,7 +146,8 @@ public String toString() { /** * Returns the annotationID (vocab ID) * associated with this token - * @return int A type ID + * @param key A type ID + * @return the annotationID (vocab ID) */ public String getAnnotation(String key) { if (annotations.containsKey(key)) { diff --git a/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java b/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java new file mode 100644 index 00000000..a615030b --- /dev/null +++ b/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides common interfaces for parsing segment files + * (aka test corpora to be translated). In order to support + * constraint annotations, we provide a general API for + * use by JoshuaDecoder and Chart. + */ +package org.apache.joshua.decoder.segment_file; diff --git a/src/joshua/lattice/Arc.java b/src/main/java/org/apache/joshua/lattice/Arc.java similarity index 98% rename from src/joshua/lattice/Arc.java rename to src/main/java/org/apache/joshua/lattice/Arc.java index 793a128c..5d056ab6 100644 --- a/src/joshua/lattice/Arc.java +++ b/src/main/java/org/apache/joshua/lattice/Arc.java @@ -16,8 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.lattice; - +package org.apache.joshua.lattice; /** * An arc in a directed graph. diff --git a/src/joshua/lattice/Lattice.java b/src/main/java/org/apache/joshua/lattice/Lattice.java similarity index 80% rename from src/joshua/lattice/Lattice.java rename to src/main/java/org/apache/joshua/lattice/Lattice.java index b0ef40f5..23321592 100644 --- a/src/joshua/lattice/Lattice.java +++ b/src/main/java/org/apache/joshua/lattice/Lattice.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.lattice; +package org.apache.joshua.lattice; import java.util.ArrayList; import java.util.Collections; @@ -25,26 +25,28 @@ import java.util.List; import java.util.Map; import java.util.Stack; -import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.segment_file.Token; -import joshua.util.ChartSpan; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.segment_file.Token; +import org.apache.joshua.util.ChartSpan; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A lattice representation of a directed graph. - * + * * @author Lane Schwartz - * @author Matt Post + * @author Matt Post post@cs.jhu.edu * @since 2008-07-08 - * - * @param Label Type of label associated with an arc. + * */ public class Lattice implements Iterable> { + private static final Logger LOG = LoggerFactory.getLogger(Lattice.class); + /** * True if there is more than one path through the lattice. */ @@ -60,9 +62,7 @@ public class Lattice implements Iterable> { */ private List> nodes; - /** Logger for this class. */ - private static final Logger logger = Logger.getLogger(Lattice.class.getName()); - + JoshuaConfiguration config = null; /** @@ -70,12 +70,13 @@ public class Lattice implements Iterable> { *

    * The list of nodes must already be in topological order. If the list is not in topological * order, the behavior of the lattice is not defined. - * + * * @param nodes A list of nodes which must be in topological order. + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public Lattice(List> nodes, JoshuaConfiguration config) { this.nodes = nodes; -// this.distances = calculateAllPairsShortestPath(); + // this.distances = calculateAllPairsShortestPath(); this.latticeHasAmbiguity = true; } @@ -83,14 +84,15 @@ public Lattice(List> nodes, boolean isAmbiguous, JoshuaConfiguration // Node sink = new Node(nodes.size()); // nodes.add(sink); this.nodes = nodes; -// this.distances = calculateAllPairsShortestPath(); + // this.distances = calculateAllPairsShortestPath(); this.latticeHasAmbiguity = isAmbiguous; } /** * Instantiates a lattice from a linear chain of values, i.e., a sentence. - * + * * @param linearChain a sequence of Value objects + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} */ public Lattice(Value[] linearChain, JoshuaConfiguration config) { this.latticeHasAmbiguity = false; @@ -114,7 +116,7 @@ public Lattice(Value[] linearChain, JoshuaConfiguration config) { i++; } -// this.distances = calculateAllPairsShortestPath(); + // this.distances = calculateAllPairsShortestPath(); } public final boolean hasMoreThanOnePath() { @@ -124,10 +126,9 @@ public final boolean hasMoreThanOnePath() { /** * Computes the shortest distance between two nodes, which is used (perhaps among other places) in * computing which rules can apply over which spans of the input - * - * @param tail - * @param head - * @return the distance, a positive number, or -1 if there is no path between the nodes + * + * @param arc an {@link org.apache.joshua.lattice.Arc} of values + * @return the shortest distance between two nodes */ public int distance(Arc arc) { return this.getShortestPath(arc.getTail().getNumber(), arc.getHead().getNumber()); @@ -139,8 +140,9 @@ public int distance(int i, int j) { /** * Convenience method to get a lattice from a linear sequence of {@link Token} objects. - * - * @param linearChain + * + * @param source input string from which to create a {@link org.apache.joshua.lattice.Lattice} + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} * @return Lattice representation of the linear chain. */ public static Lattice createTokenLatticeFromString(String source, JoshuaConfiguration config) { @@ -155,7 +157,7 @@ public static Lattice createTokenLatticeFromString(String source, JoshuaC public static Lattice createTokenLatticeFromPLF(String data, JoshuaConfiguration config) { ArrayList> nodes = new ArrayList>(); - + // This matches a sequence of tuples, which describe arcs leaving this node Pattern nodePattern = Pattern.compile("(.+?)\\(\\s*(\\(.+?\\),\\s*)\\s*\\)(.*)"); @@ -242,8 +244,9 @@ public static Lattice createTokenLatticeFromPLF(String data, JoshuaConfig /** * Constructs a lattice from a given string representation. - * + * * @param data String representation of a lattice. + * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration} * @return A lattice that corresponds to the given string. 
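A hypothetical call of createTokenLatticeFromPLF() from earlier in this hunk, reusing the PLF example from the Sentence javadoc in this patch; the Lattice&lt;Token&gt; type parameter (stripped in this rendering) and the bare default config are assumptions:

    import org.apache.joshua.decoder.JoshuaConfiguration;
    import org.apache.joshua.decoder.segment_file.Token;
    import org.apache.joshua.lattice.Lattice;

    class PlfDemo {
      public static void main(String[] args) {
        String plf = "((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)";
        Lattice<Token> lattice = Lattice.createTokenLatticeFromPLF(plf, new JoshuaConfiguration());
        // shortest path through the lattice, used when checking rule spans
        System.out.println(lattice.getShortestDistance());
      }
    }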
*/ public static Lattice createStringLatticeFromString(String data, JoshuaConfiguration config) { @@ -272,7 +275,7 @@ public static Lattice createStringLatticeFromString(String data, JoshuaC nodes.put(nodeID, currentNode); } - logger.fine("Node " + nodeID + ":"); + LOG.debug("Node : {}", nodeID); Matcher arcMatcher = arcPattern.matcher(nodeData); @@ -291,7 +294,7 @@ public static Lattice createStringLatticeFromString(String data, JoshuaC String remainingArcs = arcMatcher.group(4); - logger.fine("\t" + arcLabel + " " + arcWeight + " " + destinationNodeID); + LOG.debug("\t{} {} {}", arcLabel, arcWeight, destinationNodeID); currentNode.addArc(destinationNode, arcWeight, arcLabel); @@ -304,14 +307,14 @@ public static Lattice createStringLatticeFromString(String data, JoshuaC List> nodeList = new ArrayList>(nodes.values()); Collections.sort(nodeList, new NodeIdentifierComparator()); - logger.fine(nodeList.toString()); + LOG.debug("Nodelist={}", nodeList); return new Lattice(nodeList, config); } /** * Gets the cost of the shortest path between two nodes. - * + * * @param from ID of the starting node. * @param to ID of the ending node. * @return The cost of the shortest path between the two nodes. @@ -320,13 +323,13 @@ public int getShortestPath(int from, int to) { // System.err.println(String.format("DISTANCE(%d,%d) = %f", from, to, costs[from][to])); if (distances == null) this.distances = calculateAllPairsShortestPath(); - + return distances.get(from, to); } /** * Gets the shortest distance through the lattice. - * + * @return int representing the shortest distance through the lattice */ public int getShortestDistance() { if (distances == null) @@ -338,7 +341,7 @@ public int getShortestDistance() { * Gets the node with a specified integer identifier. If the identifier is negative, we count * backwards from the end of the array, Perl-style (-1 is the last element, -2 the penultimate, * etc). - * + * * @param index Integer identifier for a node. * @return The node with the specified integer identifier */ @@ -355,7 +358,7 @@ public List> getNodes() { /** * Returns an iterator over the nodes in this lattice. - * + * * @return An iterator over the nodes in this lattice. */ public Iterator> iterator() { @@ -364,7 +367,7 @@ public Iterator> iterator() { /** * Returns the number of nodes in this lattice. - * + * * @return The number of nodes in this lattice. */ public int size() { @@ -376,7 +379,7 @@ public int size() { *

    * Note: This method assumes no backward arcs. If there are backward arcs, the returned shortest * path costs for that node may not be accurate. - * + * * @param nodes A list of nodes which must be in topological order. * @return The all-pairs shortest path for all pairs of nodes. */ @@ -442,35 +445,35 @@ public static void main(String[] args) { /** * Replaced the arc from node i to j with the supplied lattice. This is used to do OOV * segmentation of words in a lattice. - * - * @param i - * @param j - * @param lattice + * + * @param i start node of arc + * @param j end node of arc + * @param newNodes new nodes used within the replacement operation */ public void insert(int i, int j, List> newNodes) { - + nodes.get(i).setOutgoingArcs(newNodes.get(0).getOutgoingArcs()); - + newNodes.remove(0); nodes.remove(j); Collections.reverse(newNodes); - + for (Node node: newNodes) nodes.add(j, node); - + this.latticeHasAmbiguity = false; for (int x = 0; x < nodes.size(); x++) { nodes.get(x).setID(x); this.latticeHasAmbiguity |= (nodes.get(x).getOutgoingArcs().size() > 1); } - + this.distances = null; } /** * Topologically sorts the nodes and reassigns their numbers. Assumes that the first node is the * source, but otherwise assumes nothing about the input. - * + * * Probably correct, but untested. */ @SuppressWarnings("unused") @@ -481,35 +484,104 @@ private void topologicalSort() { ArrayList> arcs = new ArrayList>(); for (Arc arc: node.getOutgoingArcs()) { arcs.add(arc); - + if (! ingraph.containsKey(arc.getHead())) ingraph.put(arc.getHead(), new ArrayList>()); ingraph.get(arc.getHead()).add(arc); - + outgraph.put(node, arcs); } } - + ArrayList> sortedNodes = new ArrayList>(); Stack> stack = new Stack>(); stack.push(nodes.get(0)); - + while (! stack.empty()) { Node node = stack.pop(); sortedNodes.add(node); for (Arc arc: outgraph.get(node)) { outgraph.get(node).remove(arc); ingraph.get(arc.getHead()).remove(arc); - + if (ingraph.get(arc.getHead()).size() == 0) sortedNodes.add(arc.getHead()); } } - + int id = 0; for (Node node : sortedNodes) node.setID(id++); - + this.nodes = sortedNodes; } + + /** + * Constructs a lattice from a given string representation. + * + * @param data String representation of a lattice. + * @return A lattice that corresponds to the given string. 
+ */ + public static Lattice createFromString(String data) { + + Map> nodes = new HashMap>(); + + Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)"); + Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)"); + + Matcher nodeMatcher = nodePattern.matcher(data); + + int nodeID = -1; + + while (nodeMatcher.matches()) { + + String nodeData = nodeMatcher.group(2); + String remainingData = nodeMatcher.group(3); + + nodeID++; + + Node currentNode; + if (nodes.containsKey(nodeID)) { + currentNode = nodes.get(nodeID); + } else { + currentNode = new Node(nodeID); + nodes.put(nodeID, currentNode); + } + + LOG.debug("Node : {}", nodeID); + + Matcher arcMatcher = arcPattern.matcher(nodeData); + + while (arcMatcher.matches()) { + String arcLabel = arcMatcher.group(1); + double arcWeight = Double.valueOf(arcMatcher.group(2)); + int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3)); + + Node destinationNode; + if (nodes.containsKey(destinationNodeID)) { + destinationNode = nodes.get(destinationNodeID); + } else { + destinationNode = new Node(destinationNodeID); + nodes.put(destinationNodeID, destinationNode); + } + + String remainingArcs = arcMatcher.group(4); + + LOG.debug("\t {} {} {}", arcLabel, arcWeight, destinationNodeID); + + currentNode.addArc(destinationNode, (float) arcWeight, arcLabel); + + arcMatcher = arcPattern.matcher(remainingArcs); + } + + nodeMatcher = nodePattern.matcher(remainingData); + } + + List> nodeList = new ArrayList>(nodes.values()); + Collections.sort(nodeList, new NodeIdentifierComparator()); + + LOG.debug("Nodelist={}", nodeList); + + return new Lattice(nodeList, new JoshuaConfiguration()); + } } diff --git a/src/joshua/lattice/Node.java b/src/main/java/org/apache/joshua/lattice/Node.java similarity index 98% rename from src/joshua/lattice/Node.java rename to src/main/java/org/apache/joshua/lattice/Node.java index 31dcea98..ecff22ee 100644 --- a/src/joshua/lattice/Node.java +++ b/src/main/java/org/apache/joshua/lattice/Node.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.lattice; +package org.apache.joshua.lattice; import java.util.ArrayList; import java.util.Iterator; @@ -55,6 +55,7 @@ public class Node

    + * When decoding with use_tree_nbest=true, instead of a flat text output like + * "i asked her a question", we get a Penn treebank format tree like + * "(ROOT (S (NP i) (VP (V asked) (NP her) (NP (DT a) (N question)))))". + * If we also set include_align_index=true, we include source-side alignments + * for each internal node of the tree. + *

    + * So, if the source input sentence is "je lui ai pose un question", if we + * turn on both configuration options, we end up with a decorated tree like + * this: + * "(ROOT{0-6} (S{0-6} (NP{0-1} i) (VP{1-6} (V{2-4} asked) (NP{1-2} her) + * (NP{4-6} (DT{4-5} a) (N{5-6} question)))))". + *

    + * This class contains all the information of that flat string representation: + * the tree structure, the output (English) words, and the alignments to a + * source sentence. + *
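A small usage sketch of the Tree class added below, driven by the decorated-tree example from this comment; Tree's package declaration is not visible in this hunk, so the sketch assumes it sits in the same package:

    class TreeDemo {
      public static void main(String[] args) {
        String s = "(ROOT{0-6} (S{0-6} (NP{0-1} i) (VP{1-6} (V{2-4} asked) (NP{1-2} her)"
            + " (NP{4-6} (DT{4-5} a) (N{5-6} question)))))";
        Tree tree = new Tree(s);
        System.out.println(tree.size());         // 14 nodes: 9 internal + 5 leaves
        System.out.println(tree.yield());        // i asked her a question
        System.out.println(tree.root().label()); // ROOT
      }
    }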

    + * Using a Tree the source sentence it was aligned to, we can create + * a DerivationTree object suitable for display. + * + * @author Jonny Weese jonny@cs.jhu.edu + */ +public class Tree { + + /** + * An array holding the label of each node of the tree, in depth-first order. + * The label of a node means the NT label assigned to an internal node, or + * the terminal symbol (English word) at a leaf. + */ + private final String [] labels; + + /** + * The number of children of each node of the tree, in depth-first order. + */ + private final int [] numChildren; + + /** + * The smallest source-side index that each node covers, in depth-first order. + * Note that we only have this information for internal nodes. For leaves, + * this value will always be -1. + */ + private final int [] sourceStartIndices; + + /** + * 1 + the largest source-side index that each node covers, in depth-first + * order. Note that we only have this informaion for internal nodes. For + * leaves, this value will always be -1. + */ + private final int [] sourceEndIndices; + + /** + * A pattern to match an aligned internal node and pull out its information. + * This pattern matches: + * + * 1) start-of-string + * 2) ( + * 3) an arbitrary sequence of non-whitespace characters (at least 1) + * 4) { + * 5) a decimal number + * 6) - + * 7) a decimal number + * 8) } + * 9) end-of-string + * + * That is, it matches something like "(FOO{32-55}". The string and two + * decimal numbers (parts 3, 5, and 7) are captured in groups. + */ + private static final Pattern NONTERMINAL_PATTERN = + Pattern.compile("^\\((\\S+)\\{(\\d+)-(\\d+)\\}$"); + + /** + * Creates a Tree object from an input string in Penn treebank format with + * source alignment annotations. + * @param s an input string in Penn treebank format with source alignment annotations + */ + public Tree(String s) { + final String [] tokens = s.replaceAll("\\)", " )").split("\\s+"); + int numNodes = 0; + for (String t : tokens) { + if (!t.equals(")")) { + numNodes++; + } + } + labels = new String[numNodes]; + numChildren = new int[numNodes]; + sourceStartIndices = new int[numNodes]; + sourceEndIndices = new int[numNodes]; + try { + initialize(tokens); + } catch (Exception e) { + // This will catch most formatting errors. + throw new IllegalArgumentException( + String.format("couldn't create tree from string: \"%s\"", s), + e); + } + } + + private void initialize(String [] tokens) { + final Stack stack = new Stack(); + int nodeIndex = 0; + for (String token : tokens) { + final Matcher matcher = NONTERMINAL_PATTERN.matcher(token); + if (matcher.matches()) { + // new non-terminal node + labels[nodeIndex] = matcher.group(1); + sourceStartIndices[nodeIndex] = Integer.parseInt(matcher.group(2)); + sourceEndIndices[nodeIndex] = Integer.parseInt(matcher.group(3)); + stack.push(nodeIndex); + nodeIndex++; + } else if (token.equals(")")) { + // finished a subtree + stack.pop(); + if (stack.empty()) { + break; + } else { + numChildren[stack.peek()]++; + } + } else { + // otherwise, it's a new leaf node + labels[nodeIndex] = token; + sourceStartIndices[nodeIndex] = -1; + sourceEndIndices[nodeIndex] = -1; + numChildren[stack.peek()]++; + nodeIndex++; + } + } + if (!stack.empty()) { + // Not enough close-parentheses at the end of the tree. + throw new IllegalArgumentException(); + } + } + + /** + * Return the number of nodes in this Tree. + * @return the number of nodes in this Tree + */ + public int size() { + return labels.length; + } + + /** + * Get the root Node of this Tree. 
+ * @return the Node present at the toom the this Tree + */ + public Node root() { + return new Node(0); + } + + private List childIndices(int index) { + List result = new ArrayList(); + int remainingChildren = numChildren[index]; + int childIndex = index + 1; + while (remainingChildren > 0) { + result.add(childIndex); + childIndex = nextSiblingIndex(childIndex); + remainingChildren--; + } + return result; + } + + private int nextSiblingIndex(int index) { + int result = index + 1; + int remainingChildren = numChildren[index]; + for (int i = 0; i < remainingChildren; i++) { + result = nextSiblingIndex(result); + } + return result; + } + + public String yield() { + String result = ""; + for (int i = 0; i < labels.length; i++) { + if (numChildren[i] == 0) { + if (!result.equals("")) { + result += " "; + } + result += labels[i]; + } + } + return result; + } + + @Override + public String toString() { + return root().toString(); + } + + /** + * A class representing the Nodes of a tree. + */ + public class Node { + + /** + * The index into the Tree class's internal arrays. + */ + private final int index; + + private Node(int i) { + index = i; + } + + /** + * Get the label for this node. If the node is internal to the tree, its + * label is the non-terminal label assigned to it. If it is a leaf node, + * the label is the English word at the leaf. + * @return a string representing the label for this node + */ + public String label() { + return labels[index]; + } + + public boolean isLeaf() { + return numChildren[index] == 0; + } + + public int sourceStartIndex() { + return sourceStartIndices[index]; + } + + public int sourceEndIndex() { + return sourceEndIndices[index]; + } + + public List children() { + List result = new ArrayList(); + for (int j : childIndices(index)) { + result.add(new Node(j)); + } + return result; + } + + @Override + public String toString() { + if (isLeaf()) { + return label(); + } + String result = String.format("(%s{%d-%d}", + label(), + sourceStartIndex(), + sourceEndIndex()); + for (Node c : children()) { + result += String.format(" %s", c); + } + return result + ")"; + } + } + + public static class NodeSourceStartComparator implements Comparator { + public int compare(Node a, Node b) { + return a.sourceStartIndex() - b.sourceStartIndex(); + } + } +} diff --git a/src/joshua/util/Algorithms.java b/src/main/java/org/apache/joshua/util/Algorithms.java similarity index 96% rename from src/joshua/util/Algorithms.java rename to src/main/java/org/apache/joshua/util/Algorithms.java index 0f25ee26..327c8827 100644 --- a/src/joshua/util/Algorithms.java +++ b/src/main/java/org/apache/joshua/util/Algorithms.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; public final class Algorithms { @@ -25,7 +25,9 @@ public final class Algorithms { * * The code is based on the example by Michael Gilleland found at * http://www.merriampark.com/ld.htm. - * + * @param candidate todo + * @param source todo + * @return the minimum edit distance. 
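A hypothetical call of the levenshtein utility documented just above (its body follows); note that it compares token arrays rather than characters, so the single differing token below costs one edit:

    import org.apache.joshua.util.Algorithms;

    class EditDistanceDemo {
      public static void main(String[] args) {
        String[] candidate = "the house is small".split(" ");
        String[] source = "the home is small".split(" ");
        System.out.println(Algorithms.levenshtein(candidate, source)); // 1 substitution
      }
    }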
*/ public static final int levenshtein(String[] candidate, String[] source) { // First check to see whether either of the arrays diff --git a/src/joshua/util/Bits.java b/src/main/java/org/apache/joshua/util/Bits.java similarity index 91% rename from src/joshua/util/Bits.java rename to src/main/java/org/apache/joshua/util/Bits.java index 2b95a5e2..b5294f6a 100644 --- a/src/joshua/util/Bits.java +++ b/src/main/java/org/apache/joshua/util/Bits.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; /** * Utility class for bit twiddling. @@ -28,9 +28,9 @@ public class Bits { /** * Encodes two shorts in an int. * - * @param high - * @param low - * @return + * @param high input high short to encode + * @param low input low short to encode + * @return encoded int */ public static int encodeAsInt(short high, short low) { @@ -79,9 +79,9 @@ public static short decodeLowBits(int i) { /** * Encodes two integers in a long. * - * @param high - * @param low - * @return + * @param high input high int to encode + * @param low input low int to encode + * @return encoded long */ public static long encodeAsLong(int high, int low) { diff --git a/src/joshua/util/BotMap.java b/src/main/java/org/apache/joshua/util/BotMap.java similarity index 98% rename from src/joshua/util/BotMap.java rename to src/main/java/org/apache/joshua/util/BotMap.java index 32dea01e..1cc82b50 100644 --- a/src/joshua/util/BotMap.java +++ b/src/main/java/org/apache/joshua/util/BotMap.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Collection; import java.util.Collections; diff --git a/src/joshua/util/Cache.java b/src/main/java/org/apache/joshua/util/Cache.java similarity index 87% rename from src/joshua/util/Cache.java rename to src/main/java/org/apache/joshua/util/Cache.java index 8da994bc..0d72f8a4 100644 --- a/src/joshua/util/Cache.java +++ b/src/main/java/org/apache/joshua/util/Cache.java @@ -16,13 +16,14 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; // Imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.LinkedHashMap; import java.util.Map; -import java.util.logging.Level; -import java.util.logging.Logger; /** * Cache is a class that implements a least recently used cache. It is a straightforward extension @@ -41,8 +42,7 @@ public class Cache extends LinkedHashMap { private static final long serialVersionUID = 6073387072740892061L; /** Logger for this class. 
*/ - private static Logger logger = Logger.getLogger(Cache.class.getName()); - + private static final Logger LOG = LoggerFactory.getLogger(Cache.class); // =============================================================== // Constants // =============================================================== @@ -104,20 +104,14 @@ public Cache() { @Override public V get(Object key) { - if (logger.isLoggable(Level.FINEST)) { - logger.finest("Cache get key: " + key.toString()); - } + LOG.debug("Cache get key: {}", key); return super.get(key); } @Override public V put(K key, V value) { - - if (logger.isLoggable(Level.FINEST)) { - logger.finest("Cache put key: " + key.toString()); - } - + LOG.debug("Cache put key: {}", key); return super.put(key, value); } @@ -128,13 +122,11 @@ public V put(K key, V value) { @Override public boolean containsKey(Object key) { boolean contains = super.containsKey(key); - - if (logger.isLoggable(Level.FINEST)) { - String message = - (contains) ? "Cache has key: " + key.toString() : "Cache lacks key: " + key.toString(); - logger.finest(message); + if (contains){ + LOG.debug("Cache has key: {}", key); + } else { + LOG.debug("Cache lacks key: {}", key); } - return contains; } @@ -157,11 +149,9 @@ public boolean containsKey(Object key) { */ protected boolean removeEldestEntry(Map.Entry eldest) { boolean removing = size() > maxCapacity; - - if (removing && logger.isLoggable(Level.FINEST)) { - logger.finest("Cache loses key: " + eldest.getKey().toString()); + if (removing ) { + LOG.debug("Cache loses key: {}", eldest.getKey()); } - return removing; } diff --git a/src/joshua/util/ChartSpan.java b/src/main/java/org/apache/joshua/util/ChartSpan.java similarity index 81% rename from src/joshua/util/ChartSpan.java rename to src/main/java/org/apache/joshua/util/ChartSpan.java index 81c6aaab..b22d2aa9 100644 --- a/src/joshua/util/ChartSpan.java +++ b/src/main/java/org/apache/joshua/util/ChartSpan.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; /** * CKY-based decoding makes extensive use of charts, which maintain information about spans (i, j) - * over the length-n input sentence, 0 <= i <= j <= n. These charts are used for many things; for + * over the length-n input sentence, 0 <= i <= j <= n. These charts are used for many things; for * example, lattices use a chart to denote whether there is a path between nodes i and j, and what - * their costs is, and the decoder uses charts to record the partial application of rules ( - * {@link DotChart}) and the existence of proved items ({@link PhraseChart}). + * their costs is, and the decoder uses charts to record the partial application of rules (DotChart}) + * and the existence of proved items ({@link org.apache.joshua.decoder.phrase.PhraseChart}). * * The dummy way to implement a chart is to initialize a two-dimensional array; however, this wastes - * a lot of space, because the constraint (i <= j) means that only half of this space can ever be + * a lot of space, because the constraint (i <= j) means that only half of this space can ever be * used. This is especially a problem for lattices, where the sentence length (n) is the number of * nodes in the lattice! * @@ -34,7 +34,7 @@ * spans under a given maximum length. This class implements that in a generic way, introducing * large savings in both space and time. 
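The triangular packing ChartSpan uses can be checked standalone. This sketch repeats the offset formula from the offset() method later in this hunk and enumerates every valid span exactly once; the class name is illustrative:

    class ChartSpanOffsetDemo {
      // offset(i,j) = i * (max + 1) - i * (i + 1) / 2 + j, as in ChartSpan.offset()
      static int offset(int i, int j, int max) {
        return i * (max + 1) - i * (i + 1) / 2 + j;
      }

      public static void main(String[] args) {
        int max = 3; // the 10 valid spans (i <= j) map to offsets 0..9 with no gaps
        for (int i = 0; i <= max; i++)
          for (int j = i; j <= max; j++)
            System.out.printf("(%d,%d) -> %d%n", i, j, offset(i, j, max));
      }
    }

Packing only the upper triangle roughly halves the memory of a dense (max+1) x (max+1) array, which is the saving the class javadoc above describes.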
* - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class ChartSpan<Type> { Object[] chart; @@ -64,17 +64,14 @@ public void set(int i, int j, Type value) { /** * This computes the offset into the one-dimensional array for a given span. * - * @param i - * @param j + * @param i source node in span + * @param j target node in span * @return the offset - * @throws InvalidSpanException */ private int offset(int i, int j) { if (i < 0 || j > max || i > j) { throw new RuntimeException(String.format("Invalid span (%d,%d | %d)", i, j, max)); } - - // System.err.println(String.format("ChartSpan::offset(%d,%d) = %d / %d", i, j, i * (max + 1) - i * (i + 1) / 2 + j, max * (max + 1) - max * (max + 1) / 2 + max)); return i * (max + 1) - i * (i + 1) / 2 + j; } @@ -82,7 +79,7 @@ private int offset(int i, int j) { /** * Convenience function for setting the values along the diagonal. * - * @param value + * @param value input Type for which to set values */ public void setDiagonal(Type value) { for (int i = 0; i <= max; i++) diff --git a/src/joshua/util/CommandLineParser.java b/src/main/java/org/apache/joshua/util/CommandLineParser.java similarity index 99% rename from src/joshua/util/CommandLineParser.java rename to src/main/java/org/apache/joshua/util/CommandLineParser.java index d79fd55e..974b9732 100644 --- a/src/joshua/util/CommandLineParser.java +++ b/src/main/java/org/apache/joshua/util/CommandLineParser.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Collection; import java.util.HashMap; diff --git a/src/joshua/util/CompareGrammars.java b/src/main/java/org/apache/joshua/util/CompareGrammars.java similarity index 77% rename from src/joshua/util/CompareGrammars.java rename to src/main/java/org/apache/joshua/util/CompareGrammars.java index 109d7a19..1dabac57 100644 --- a/src/joshua/util/CompareGrammars.java +++ b/src/main/java/org/apache/joshua/util/CompareGrammars.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.File; import java.io.FileNotFoundException; import java.util.HashSet; import java.util.Scanner; import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; -import joshua.decoder.ff.tm.format.HieroFormatReader; +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class allows two grammars (loaded from disk) to be compared. @@ -35,8 +35,7 @@ */ public class CompareGrammars { - /** Logger for this class. */ - private static final Logger logger = Logger.getLogger(CompareGrammars.class.getName()); + private static final Logger LOG = LoggerFactory.getLogger(CompareGrammars.class); /** * Gets a set containing all unique instances of the specified field.
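To see why ChartSpan's offset() formula packs the upper triangle densely, here is a small standalone check (not part of the patch) that enumerates all spans for a toy maximum and prints their offsets; in row-major span order the offsets come out consecutive, filling exactly (max+1)(max+2)/2 slots:

// Standalone check of the triangular offset used by ChartSpan.offset() (illustrative).
public class OffsetCheck {
  static int offset(int i, int j, int max) {
    return i * (max + 1) - i * (i + 1) / 2 + j;
  }

  public static void main(String[] args) {
    int max = 3;
    int expected = 0;
    for (int i = 0; i <= max; i++) {
      for (int j = i; j <= max; j++) {
        // Each span (i,j) with 0 <= i <= j <= max lands on the next free slot.
        System.out.printf("span (%d,%d) -> offset %d (expected %d)%n",
            i, j, offset(i, j, max), expected++);
      }
    }
  }
}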
@@ -45,7 +44,7 @@ public class CompareGrammars { * @param fieldDelimiter Regular expression to split each line * @param fieldNumber Field from each rule to extract * @return set containing all unique instances of the specified field - * @throws FileNotFoundException + * @throws FileNotFoundException if the input grammar file cannot be found */ public static Set<String> getFields(File grammarFile, String fieldDelimiter, int fieldNumber) throws FileNotFoundException { @@ -101,13 +100,12 @@ public static void compareValues(File grammarFile1, File grammarFile2, String fi float diff = (diff1 < diff2) ? diff1 : diff2; if (diff > delta) { - logger.fine("Line " + counter + ": Score mismatch: " + score1 + " vs " + score2); + LOG.debug("Line {}: Score mismatch: {} vs {}", counter, score1, score2); set.add(line1); totalOverDiffs += diff; - } else if (logger.isLoggable(Level.FINEST)) { - logger.finest("Line " + counter + ": Scores MATCH: " + score1 + " vs " + score2); + } else { + LOG.debug("Line {}: Scores MATCH: {} vs {}", counter, score1, score2); } - } else { throw new RuntimeException("Lines don't match: " + line1 + " and " + line2); } @@ -117,11 +115,11 @@ public static void compareValues(File grammarFile1, File grammarFile2, String fi grammarScanner2.close(); if (set.isEmpty()) { - logger.info("No score mismatches"); + LOG.info("No score mismatches"); } else { - logger.warning("Number of mismatches: " + set.size() + " out of " + counter); - logger.warning("Total mismatch logProb mass: " + totalOverDiffs + " (" + totalOverDiffs - / set.size() + ") (" + totalOverDiffs / counter + ")"); + LOG.warn("Number of mismatches: {} out of {}", set.size(), counter); + LOG.warn("Total mismatch logProb mass: {} ({}) ({})", totalOverDiffs, + totalOverDiffs / set.size(), totalOverDiffs / counter); } } @@ -129,20 +127,20 @@ public static void compareValues(File grammarFile1, File grammarFile2, String fi * Main method.
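The logging conversion in CompareGrammars (and throughout this patch) trades explicit isLoggable guards for SLF4J's {} placeholders. The point of the placeholder form is that argument formatting is deferred until the level is known to be enabled, so cheap arguments need no guard at all; a guard is still worthwhile when computing the argument is itself expensive. A small illustration (assumes an SLF4J binding on the classpath):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Illustration of parameterized SLF4J logging versus explicit level guards.
public class LoggingSketch {
  private static final Logger LOG = LoggerFactory.getLogger(LoggingSketch.class);

  public static void main(String[] args) {
    int counter = 42;
    double score1 = 0.5, score2 = 0.75;
    // No guard needed: the message is only assembled if DEBUG is enabled.
    LOG.debug("Line {}: Score mismatch: {} vs {}", counter, score1, score2);
    // A guard still pays off when the argument itself is costly to compute.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Expensive diagnostic: {}", buildExpensiveReport());
    }
  }

  private static String buildExpensiveReport() {
    return "...";
  }
}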
* * @param args names of the two grammars to be compared - * @throws FileNotFoundException + * @throws FileNotFoundException if any of the input grammar files cannot be found */ public static void main(String[] args) throws FileNotFoundException { if (args.length != 2) { - logger.severe("Usage: " + CompareGrammars.class.toString() + " grammarFile1 grammarFile2"); + LOG.error("Usage: {} grammarFile1 grammarFile2", CompareGrammars.class.toString()); System.exit(-1); } // Tell standard in and out to use UTF-8 FormatUtils.useUTF8(); - logger.finest("Using UTF-8"); + LOG.debug("Using UTF-8"); - logger.info("Comparing grammar files " + args[0] + " and " + args[1]); + LOG.info("Comparing grammar files {} and {}", args[0], args[1]); File grammarFile1 = new File(args[0]); File grammarFile2 = new File(args[1]); @@ -157,9 +155,9 @@ public static void main(String[] args) throws FileNotFoundException { Set<String> leftHandSides1 = getFields(grammarFile1, fieldDelimiter, 0); Set<String> leftHandSides2 = getFields(grammarFile2, fieldDelimiter, 0); if (leftHandSides1.equals(leftHandSides2)) { - logger.info("Grammar files have the same set of left-hand sides"); + LOG.info("Grammar files have the same set of left-hand sides"); } else { - logger.warning("Grammar files have differing sets of left-hand sides"); + LOG.warn("Grammar files have differing sets of left-hand sides"); compareScores = false; } } @@ -170,9 +168,9 @@ Set<String> sourceRHSs1 = getFields(grammarFile1, fieldDelimiter, 1); Set<String> sourceRHSs2 = getFields(grammarFile2, fieldDelimiter, 1); if (sourceRHSs1.equals(sourceRHSs2)) { - logger.info("Grammar files have the same set of source right-hand sides"); + LOG.info("Grammar files have the same set of source right-hand sides"); } else { - logger.warning("Grammar files have differing sets of source right-hand sides"); + LOG.warn("Grammar files have differing sets of source right-hand sides"); compareScores = false; } } @@ -184,9 +182,9 @@ Set<String> targetRHSs1 = getFields(grammarFile1, fieldDelimiter, 2); Set<String> targetRHSs2 = getFields(grammarFile2, fieldDelimiter, 2); if (targetRHSs1.equals(targetRHSs2)) { - logger.info("Grammar files have the same set of target right-hand sides"); + LOG.info("Grammar files have the same set of target right-hand sides"); } else { - logger.warning("Grammar files have differing sets of target right-hand sides"); + LOG.warn("Grammar files have differing sets of target right-hand sides"); compareScores = false; } } @@ -197,11 +195,6 @@ compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 0, delta); compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 1, delta); compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", 2, delta); - } - } - - - } diff --git a/src/joshua/util/Counted.java b/src/main/java/org/apache/joshua/util/Counted.java similarity index 98% rename from src/joshua/util/Counted.java rename to src/main/java/org/apache/joshua/util/Counted.java index 1014e124..9f719b37 100644 --- a/src/joshua/util/Counted.java +++ b/src/main/java/org/apache/joshua/util/Counted.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Comparator; @@ -78,8 +78,7 @@ public int compareTo(Counted o) { /** * Gets a comparator that compares two counted objects based on the reverse of the natural order * of the counts associated with each object.
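The comparator Counted documents here is simply the reverse of the natural ascending order on counts. A minimal sketch of the idea (the class and method names below are illustrative assumptions, not Joshua's):

import java.util.Comparator;

// Illustrative sketch of ordering counted objects by descending count.
public class CountedSketch<E> {
  final E element;
  final int count;

  CountedSketch(E element, int count) {
    this.element = element;
    this.count = count;
  }

  static <O> Comparator<CountedSketch<O>> descendingByCount() {
    // Comparing b against a reverses the natural ascending order of counts.
    return (a, b) -> Integer.compare(b.count, a.count);
  }
}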
- * - * @param <O> + * @param <O> todo * @return A comparator that compares two counted objects based on the reverse of the natural * order of the counts associated with each object */ diff --git a/src/joshua/util/Counts.java b/src/main/java/org/apache/joshua/util/Counts.java similarity index 96% rename from src/joshua/util/Counts.java rename to src/main/java/org/apache/joshua/util/Counts.java index 4a20009b..89a9f38f 100644 --- a/src/joshua/util/Counts.java +++ b/src/main/java/org/apache/joshua/util/Counts.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.IOException; import java.io.ObjectInput; @@ -31,8 +31,6 @@ * * @author Lane Schwartz * @author Chris Callison-Burch - * @param <A> - * @param <B> */ public class Counts<A, B> implements Iterable<Pair<A, B>> { @@ -78,8 +76,8 @@ public Counts(float floorProbability) { /** * Increments the co-occurrence count of the provided objects. * - * @param a - * @param b + * @param a input object A + * @param b input object B */ public void incrementCount(A a, B b) { // increment the count and handle the adding of objects to the map if they aren't already there @@ -127,8 +125,8 @@ public void incrementCount(A a, B b) { /** * Gets the co-occurrence count for the two elements. * - * @param a - * @param b + * @param a input object A + * @param b input object B * @return the co-occurrence count for the two elements */ public int getCount(A a, B b) { @@ -161,8 +159,8 @@ int getCount(B b) { *

    * This value is the relative frequency estimate. * - * @param a - * @param b + * @param a object A + * @param b object B * @return the probability of a given b. */ public float getProbability(A a, B b) { @@ -202,8 +200,8 @@ public float getProbability(A a, B b) { *

    * This value is the relative frequency estimate in the reverse direction. * - * @param b - * @param a + * @param b object B + * @param a object A * @return the probability of b given a. */ public float getReverseProbability(B b, A a) { diff --git a/src/joshua/util/ExtractTopCand.java b/src/main/java/org/apache/joshua/util/ExtractTopCand.java similarity index 88% rename from src/joshua/util/ExtractTopCand.java rename to src/main/java/org/apache/joshua/util/ExtractTopCand.java index c24f9702..8f9b575e 100644 --- a/src/joshua/util/ExtractTopCand.java +++ b/src/main/java/org/apache/joshua/util/ExtractTopCand.java @@ -16,22 +16,23 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; -import joshua.util.io.IndexedReader; -import joshua.util.io.LineReader; - +import org.apache.joshua.util.io.IndexedReader; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This program extracts the 1-best output translations from the n-best output translations - * generated by {@link joshua.decoder.Decoder}. + * generated by {@link org.apache.joshua.decoder.Decoder}. * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ /* @@ -42,13 +43,16 @@ */ public class ExtractTopCand { + private static final Logger LOG = LoggerFactory.getLogger(ExtractTopCand.class); + /** * Usage: java ExtractTopCand nbestInputFile 1bestOutputFile. *
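The two probability accessors documented above for Counts are plain relative frequency estimates: getProbability(a, b) divides the joint count of (a, b) by the marginal count of b, and getReverseProbability flips the conditioning. A compact sketch of that bookkeeping, without the floor probability and serialization the real Counts class carries:

import java.util.HashMap;
import java.util.Map;

// Minimal sketch of the relative frequency estimate documented for Counts (illustrative).
public class CountsSketch<A, B> {
  private final Map<B, Map<A, Integer>> joint = new HashMap<>();
  private final Map<B, Integer> bTotals = new HashMap<>();

  public void incrementCount(A a, B b) {
    // Bump the joint count of (a, b) and the marginal count of b.
    joint.computeIfAbsent(b, k -> new HashMap<>()).merge(a, 1, Integer::sum);
    bTotals.merge(b, 1, Integer::sum);
  }

  // P(a | b) = count(a, b) / count(b), the relative frequency estimate.
  public float getProbability(A a, B b) {
    Integer total = bTotals.get(b);
    if (total == null || total == 0) return 0f;
    int c = joint.get(b).getOrDefault(a, 0);
    return (float) c / total;
  }
}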

    * If the input file name is "-" then input is read from System.in. If the output * file name is "-" then output is directed to System.out. If a file already exists * with the output file name, it is truncated before writing. The bulk of this program is - * implemented by {@link #extractOneBest(IndexedReader,BufferedWriter)}. + * implemented by {@link org.apache.joshua.util.ExtractTopCand#extractOneBest(IndexedReader, BufferedWriter, int)}. + * @param args input arguments for the tool */ public static void main(String[] args) { String inFile = "-"; @@ -98,7 +102,7 @@ public static void main(String[] args) { // block. Printing to a closed PrintStream generates // no exceptions. We should be printing to System.err // anyways, but this something subtle to be aware of. - System.err.println("There was an error: " + ioe.getMessage()); + LOG.error(ioe.getMessage(), ioe); } } @@ -109,7 +113,7 @@ public static void main(String[] args) { * first occurance of the segment ID. Any information about the segment other than the translation * (including segment ID) is not printed to the writer. * - *

Developer Notes This implementation assumes: + * Developer Notes This implementation assumes: * 1. all translations for a segment are contiguous * 2. the 1-best translation is the first one encountered. @@ -117,10 +121,15 @@ * We will need to alter the implementation if these assumptions no longer hold for the output of * JoshuaDecoder (or any sensible n-best format passed to this method). *

- * We should switch to using an n-best {@link joshua.decoder.segment_file.SegmentFileParser} to + * TODO We should switch to using an n-best SegmentFileParser to * ensure future compatibility with being able to configure the output format of the decoder. The * MERT code needs such a SegmentFileParser anyways, so that will reduce the code duplication * between these two classes. + * + * @param nbestReader todo + * @param onebestWriter todo + * @param field todo + * @throws IOException if there is an issue reading or writing input/output data */ protected static void extractOneBest(IndexedReader<String> nbestReader, BufferedWriter onebestWriter, int field) throws IOException { diff --git a/src/joshua/util/FileUtility.java b/src/main/java/org/apache/joshua/util/FileUtility.java similarity index 85% rename from src/joshua/util/FileUtility.java rename to src/main/java/org/apache/joshua/util/FileUtility.java index 06856553..9dad55aa 100644 --- a/src/joshua/util/FileUtility.java +++ b/src/main/java/org/apache/joshua/util/FileUtility.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -39,8 +39,8 @@ /** * utility functions for file operations * - * @author Zhifei Li, - * @author wren ng thornton + * @author Zhifei Li, zhifei.work@gmail.com + * @author wren ng thornton wren@users.sourceforge.net * @since 28 February 2009 */ public class FileUtility { @@ -52,7 +52,12 @@ public class FileUtility { */ private static final Charset FILE_ENCODING = Charset.forName(DEFAULT_ENCODING); - /** Warning, will truncate/overwrite existing files */ + /** + * Warning, will truncate/overwrite existing files + * @param filename a file for which to obtain a writer + * @return the buffered writer object + * @throws IOException if there is a problem opening the file for writing + */ public static BufferedWriter getWriteFileStream(String filename) throws IOException { return new BufferedWriter(new OutputStreamWriter( // TODO: add GZIP @@ -83,7 +88,7 @@ public static boolean deleteRecursively(File f) { * * @param data The integer array to write to disk. * @param filename The filename where the data should be written. - * @throws IOException + * @throws IOException if there is a problem writing to the output file * @return the FileOutputStream on which the bytes were written */ public static FileOutputStream writeBytes(int[] data, String filename) throws IOException { @@ -97,7 +102,7 @@ public static FileOutputStream writeBytes(int[] data, String filename) throws IO * * @param data The integer array to write to disk. * @param out The output stream where the data should be written.
- * @throws IOException + * @throws IOException if there is a problem writing bytes */ public static void writeBytes(int[] data, OutputStream out) throws IOException { @@ -118,11 +123,8 @@ public static void copyFile(String srFile, String dtFile) throws IOException { File f1 = new File(srFile); File f2 = new File(dtFile); copyFile(f1, f2); - } catch (FileNotFoundException ex) { - System.out.println(ex.getMessage() + " in the specified directory."); - System.exit(0); } catch (IOException e) { - System.out.println(e.getMessage()); + throw new RuntimeException(e); } } @@ -145,11 +147,8 @@ public static void copyFile(File srFile, File dtFile) throws IOException { in.close(); out.close(); System.out.println("File copied."); - } catch (FileNotFoundException ex) { - System.out.println(ex.getMessage() + " in the specified directory."); - System.exit(0); } catch (IOException e) { - System.out.println(e.getMessage()); + throw new RuntimeException(e); } } @@ -182,8 +181,10 @@ static public boolean deleteFile(String fileName) { } /** - * Returns the base directory of the file. For example, dirname('/usr/local/bin/emacs') -> + * Returns the base directory of the file. For example, dirname('/usr/local/bin/emacs') -&gt; * '/usr/local/bin' + * @param fileName the input path + * @return the parent path */ static public String dirname(String fileName) { if (fileName.indexOf(File.separator) != -1) @@ -217,23 +218,23 @@ public static void closeCloseableIfNotNull(Closeable fileWriter) { * Returns the directory where the program has been started, * the base directory you will implicitly get when specifying no * full path when e.g. opening a file - * @return + * @return the current 'user.dir' */ public static String getWorkingDirectory() { return System.getProperty("user.dir"); } /** - * Method to handle standard IO xceptions. catch (Exception e) {Utility.handleIO_exception(e);} + * Method to handle standard IO exceptions. catch (Exception e) {Utility.handleIO_exception(e);} + * @param e an input {@link java.lang.Exception} */ public static void handleExceptions(Exception e) { - e.printStackTrace(); - System.exit(-1); + throw new RuntimeException(e); } /** * Convenience method to get a full file as a String - * @param file + * @param file the input {@link java.io.File} * @return The file as a String. Lines are separated by newline character. */ public static String getFileAsString(File file) { @@ -252,6 +253,9 @@ public static String getFileAsString(File file) { * This method returns a List of String. Each element of the list corresponds to a line from the * input file. The boolean keepDuplicates in the input determines if duplicate lines are allowed * in the output LinkedList or not.
+ * @param file the input file + * @param keepDuplicates whether to retain duplicate lines + * @return a {@link java.util.List} of lines */ static public List<String> getLines(File file, boolean keepDuplicates) { LinkedList<String> list = new LinkedList<String>(); @@ -280,8 +284,9 @@ static public List<String> getLines(File file, boolean keepDuplicates) { /** * Returns a Scanner of the inputFile using a specific encoding * - * @param inputFile - * @return : Scanner + * @param inputFile the file for which to get a {@link java.util.Scanner} object + * @param encoding the encoding to use within the Scanner + * @return a {@link java.util.Scanner} object for a given file */ public static Scanner getScanner(File inputFile, String encoding) { Scanner scan = null; @@ -296,8 +301,8 @@ public static Scanner getScanner(File inputFile, String encoding) { /** * Returns a Scanner of the inputFile using default encoding * - * @param inputFile - * @return : Scanner + * @param inputFile the file for which to get a {@link java.util.Scanner} object + * @return a {@link java.util.Scanner} object for a given file */ public static Scanner getScanner(File inputFile) { return getScanner(inputFile, DEFAULT_ENCODING); diff --git a/src/joshua/util/FormatUtils.java b/src/main/java/org/apache/joshua/util/FormatUtils.java similarity index 88% rename from src/joshua/util/FormatUtils.java rename to src/main/java/org/apache/joshua/util/FormatUtils.java index 67b2bf33..eb594803 100644 --- a/src/joshua/util/FormatUtils.java +++ b/src/main/java/org/apache/joshua/util/FormatUtils.java @@ -16,13 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.util.regex.Pattern; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Utility class for format issues. @@ -31,14 +33,16 @@ * @author Lane Schwartz */ public class FormatUtils { - + + private static final Logger LOG = LoggerFactory.getLogger(FormatUtils.class); + private static final String INDEX_SEPARATOR = ","; /** * Determines whether the string is a nonterminal by checking that the first character is [ * and the last character is ]. * - * @param token + * @param token input string * @return true if it's a nonterminal symbol, false otherwise */ public static boolean isNonterminal(String token) { @@ -49,8 +53,8 @@ * Nonterminals are stored in the vocabulary in square brackets. This removes them when you * just want the raw nonterminal word. * Supports indexed and non-indexed nonTerminals: - * [GOAL] -> GOAL - * [X,1] -> [X] + * [GOAL] -&gt; GOAL + * [X,1] -&gt; [X] * * @param nt the nonterminal, e.g., "[GOAL]" * @return the cleaned nonterminal, e.g., "GOAL" @@ -72,7 +76,9 @@ private static boolean isIndexedNonTerminal(String nt) { } /** - * Removes the index from a nonTerminal: [X,1] -> [X]. + * Removes the index from a nonTerminal: [X,1] -&gt; [X]. + * @param nt an input non-terminal string + * @return the stripped non-terminal string */ public static String stripNonTerminalIndex(String nt) { return markup(cleanNonTerminal(nt)); @@ -117,6 +123,8 @@ public static String unescapeSpecialSymbols(String s) { /** * wrap sentence with sentence start/stop markers * as defined by Vocabulary; separated by a single whitespace.
+ * @param s an input sentence + * @return the wrapped sentence */ public static String addSentenceMarkers(String s) { return Vocabulary.START_SYM + " " + s + " " + Vocabulary.STOP_SYM; } @@ -124,6 +132,8 @@ /** * strip sentence markers (and whitespaces) from string + * @param s the sentence to strip of markers (and whitespaces) + * @return the stripped string */ public static String removeSentenceMarkers(String s) { return s.replaceAll("<s> ", "").replace(" </s>", ""); } @@ -134,7 +144,7 @@ *
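A quick round trip through the two marker helpers, assuming Joshua's usual "<s>" and "</s>" start and stop symbols (the constants below stand in for Vocabulary.START_SYM and Vocabulary.STOP_SYM):

// Round-trip sketch for the sentence-marker helpers above (illustrative).
public class MarkerSketch {
  static final String START_SYM = "<s>";
  static final String STOP_SYM = "</s>";

  static String addSentenceMarkers(String s) {
    return START_SYM + " " + s + " " + STOP_SYM;
  }

  static String removeSentenceMarkers(String s) {
    // Strips the start marker plus its trailing space, and the stop marker plus its leading space.
    return s.replaceAll(START_SYM + " ", "").replace(" " + STOP_SYM, "");
  }

  public static void main(String[] args) {
    String wrapped = addSentenceMarkers("hello world");
    System.out.println(wrapped);                        // <s> hello world </s>
    System.out.println(removeSentenceMarkers(wrapped)); // hello world
  }
}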

      * The body of this method is taken from the Javadoc documentation for the Java Double class. * - * @param string + * @param string an input string * @see java.lang.Double * @return true if the string represents a valid number, false otherwise */ @@ -193,12 +203,10 @@ public static boolean useUTF8() { System.setErr(new PrintStream(System.err, true, "UTF8")); return true; } catch (UnsupportedEncodingException e1) { - System.err - .println("UTF8 is not a valid encoding; using system default encoding for System.out and System.err."); + LOG.warn("UTF8 is not a valid encoding; using system default encoding for System.out and System.err."); return false; } catch (SecurityException e2) { - System.err - .println("Security manager is configured to disallow changes to System.out or System.err; using system default encoding."); + LOG.warn("Security manager is configured to disallow changes to System.out or System.err; using system default encoding."); return false; } } @@ -206,7 +214,7 @@ public static boolean useUTF8() { /** * Determines if a string contains ALL CAPS * - * @param token + * @param token an input token * @return true if the string is all in uppercase, false otherwise */ public static boolean ISALLUPPERCASE(String token) { diff --git a/src/joshua/util/IntegerPair.java b/src/main/java/org/apache/joshua/util/IntegerPair.java similarity index 97% rename from src/joshua/util/IntegerPair.java rename to src/main/java/org/apache/joshua/util/IntegerPair.java index 08cefe17..bfbfa23e 100644 --- a/src/joshua/util/IntegerPair.java +++ b/src/main/java/org/apache/joshua/util/IntegerPair.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; /** * Memory-efficient implementation of an integer tuple. diff --git a/src/joshua/util/JoshuaEval.java b/src/main/java/org/apache/joshua/util/JoshuaEval.java similarity index 88% rename from src/joshua/util/JoshuaEval.java rename to src/main/java/org/apache/joshua/util/JoshuaEval.java index 6c0761a2..31716419 100644 --- a/src/joshua/util/JoshuaEval.java +++ b/src/main/java/org/apache/joshua/util/JoshuaEval.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.util; +package org.apache.joshua.util; import java.io.BufferedReader; import java.io.File; @@ -29,7 +29,7 @@ import java.text.DecimalFormat; import java.util.TreeSet; -import joshua.metrics.EvaluationMetric; +import org.apache.joshua.metrics.EvaluationMetric; public class JoshuaEval { final static DecimalFormat f4 = new DecimalFormat("###0.0000"); @@ -102,13 +102,11 @@ private static void evaluate(String inFileName, String inFileFormat, int candPer // testIndex=candPerSen means last candidate should be evaluated if (inFileFormat.equals("plain") && candPerSen < 1) { - println("candPerSen must be positive for a file in plain format."); - System.exit(30); + throw new RuntimeException("candPerSen must be positive for a file in plain format."); } if (inFileFormat.equals("plain") && (testIndex < 1 || testIndex > candPerSen)) { - println("For the plain format, testIndex must be in [1,candPerSen]"); - System.exit(31); + throw new RuntimeException("For the plain format, testIndex must be in [1,candPerSen]"); } @@ -170,8 +168,7 @@ private static void evaluate(String inFileName, String inFileFormat, int candPer if (line == null) { println("Not enough candidates in " + inFileName + " to extract the " + candRank + "'th candidate for each sentence."); - println("(Failed to extract one for the " + i + "'th sentence (0-indexed).)"); - System.exit(32); + throw new RuntimeException("(Failed to extract one for the " + i + "'th sentence (0-indexed).)"); } int read_i = Integer.parseInt(line.substring(0, line.indexOf(" |||")).trim()); @@ -188,30 +185,25 @@ private static void evaluate(String inFileName, String inFileFormat, int candPer n = 1; i += 1; } else { - println("Not enough candidates in " + inFileName + " to extract the " + candRank - + "'th candidate for each sentence."); - println("(Failed to extract one for the " + i + "'th sentence (0-indexed).)"); - System.exit(32); + String msg = "Not enough candidates in " + inFileName + " to extract the " + candRank + + "'th candidate for each sentence. 
(Failed to extract one for the " + + i + "'th sentence (0-indexed).)"; + throw new RuntimeException(msg); } } // while (line != null) if (i != numSentences) { - println("Not enough candidates were found (i = " + i + "; was expecting " + numSentences + throw new RuntimeException("Not enough candidates were found (i = " + i + "; was expecting " + numSentences + ")"); - System.exit(33); } } // nbest format inFile.close(); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } @@ -296,28 +288,24 @@ private static void processArgsAndInitialize(String[] args) { } else if (option.equals("-format")) { candFileFormat = args[argno + 1]; if (!candFileFormat.equals("plain") && !candFileFormat.equals("nbest")) { - println("candFileFormat must be either plain or nbest."); - System.exit(10); + throw new RuntimeException("candFileFormat must be either plain or nbest."); } } else if (option.equals("-rank")) { candRank = Integer.parseInt(args[argno + 1]); if (refsPerSen < 1) { - println("Argument for -rank must be positive."); - System.exit(10); + throw new RuntimeException("Argument for -rank must be positive."); } } else if (option.equals("-ref")) { refFileName = args[argno + 1]; } else if (option.equals("-rps")) { refsPerSen = Integer.parseInt(args[argno + 1]); if (refsPerSen < 1) { - println("refsPerSen must be positive."); - System.exit(10); + throw new RuntimeException("refsPerSen must be positive."); } } else if (option.equals("-txtNrm")) { textNormMethod = Integer.parseInt(args[argno + 1]); if (textNormMethod < 0 || textNormMethod > 4) { - println("textNormMethod should be between 0 and 4"); - System.exit(10); + throw new RuntimeException("textNormMethod should be between 0 and 4"); } } else if (option.equals("-m")) { metricName = args[argno + 1]; @@ -329,8 +317,7 @@ private static void processArgsAndInitialize(String[] args) { } argno += optionCount; } else { - println("Unknown metric name " + metricName + "."); - System.exit(10); + throw new RuntimeException("Unknown metric name " + metricName + "."); } } else if (option.equals("-evr")) { int evr = Integer.parseInt(args[argno + 1]); @@ -339,8 +326,7 @@ private static void processArgsAndInitialize(String[] args) { } else if (evr == 0) { evaluateRefs = false; } else { - println("evalRefs must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("evalRefs must be either 0 or 1."); } } else if (option.equals("-v")) { int v = Integer.parseInt(args[argno + 1]); @@ -349,12 +335,10 @@ private static void processArgsAndInitialize(String[] args) { } else if (v == 0) { verbose = false; } else { - println("verbose must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("verbose must be either 0 or 1."); } } else { - println("Unknown option " + option); - System.exit(10); + throw new RuntimeException("Unknown option " + option); } argno += 2; @@ -366,8 +350,7 @@ private static void processArgsAndInitialize(String[] args) { if (! new File(refFile).exists()) refFile = refFileName + ".0"; if (! 
new File(refFile).exists()) { - System.err.println(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName)); - System.exit(1); + throw new RuntimeException(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName)); } numSentences = countLines(refFile); @@ -390,8 +373,7 @@ private static void processArgsAndInitialize(String[] args) { if (! new File(refFile).exists()) refFile = refFileName + "." + i; if (! new File(refFile).exists()) { - System.err.println(String.format("* FATAL: can't find reference file '%s'", refFile)); - System.exit(1); + throw new RuntimeException(String.format("* FATAL: can't find reference file '%s'", refFile)); } reference_readers[i] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFile)), "utf8")); @@ -409,12 +391,8 @@ private static void processArgsAndInitialize(String[] args) { for (int i = 0; i < refsPerSen; i++) reference_readers[i].close(); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in JoshuaEval.processArgsAndInitialize(): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in JoshuaEval.processArgsAndInitialize(): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } // set static data members for the EvaluationMetric class @@ -587,8 +565,7 @@ private static int countLines(String fileName) { inFile.close(); } catch (IOException e) { - System.err.println("IOException in MertCore.countLines(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return count; diff --git a/src/joshua/util/ListUtil.java b/src/main/java/org/apache/joshua/util/ListUtil.java similarity index 98% rename from src/joshua/util/ListUtil.java rename to src/main/java/org/apache/joshua/util/ListUtil.java index 0ef51909..afb5af12 100644 --- a/src/joshua/util/ListUtil.java +++ b/src/main/java/org/apache/joshua/util/ListUtil.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.List; @@ -65,7 +65,7 @@ public static String objectListString(List list) { * S1,...,Sn * * @param list A list of Strings - * @return + * @return todo */ public static String stringListStringWithoutBrackets(List list) { return stringListStringWithoutBracketsWithSpecifiedSeparator(list, " "); diff --git a/src/joshua/util/Lists.java b/src/main/java/org/apache/joshua/util/Lists.java similarity index 99% rename from src/joshua/util/Lists.java rename to src/main/java/org/apache/joshua/util/Lists.java index 43ffa00b..d62d1aab 100644 --- a/src/joshua/util/Lists.java +++ b/src/main/java/org/apache/joshua/util/Lists.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Iterator; import java.util.NoSuchElementException; diff --git a/src/joshua/util/NBestListUtility.java b/src/main/java/org/apache/joshua/util/NBestListUtility.java similarity index 98% rename from src/joshua/util/NBestListUtility.java rename to src/main/java/org/apache/joshua/util/NBestListUtility.java index 257f3c08..08c85baa 100644 --- a/src/joshua/util/NBestListUtility.java +++ b/src/main/java/org/apache/joshua/util/NBestListUtility.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.util; +package org.apache.joshua.util; import java.util.ArrayList; import java.util.List; diff --git a/src/joshua/util/Ngram.java b/src/main/java/org/apache/joshua/util/Ngram.java similarity index 88% rename from src/joshua/util/Ngram.java rename to src/main/java/org/apache/joshua/util/Ngram.java index 7ee1703c..73909cee 100644 --- a/src/joshua/util/Ngram.java +++ b/src/main/java/org/apache/joshua/util/Ngram.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.List; import java.util.Map; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; /** * Provides convenience functions for extracting all ngrams from a sentence, represented as an array @@ -46,7 +46,13 @@ public static void getNgrams(Map tbl, int startOrder, int endOr } } - /** if symbolTbl!=null, then convert interger to String */ + /** + * If symbolTbl!=null, then convert integer to String + * @param tbl todo + * @param startOrder todo + * @param endOrder todo + * @param wrds todo + */ public static void getNgrams(Map<String, Integer> tbl, int startOrder, int endOrder, final List<Integer> wrds) { @@ -64,7 +70,13 @@ public static void getNgrams(Map tbl, int startOrder, int endOr } } - /** if symbolTbl!=null, then convert string to integer */ + /** + * If symbolTbl!=null, then convert string to integer + * @param tbl todo + * @param startOrder todo + * @param endOrder todo + * @param wrds todo + */ public static void getNgrams(Map<String, Integer> tbl, int startOrder, int endOrder, final String[] wrds) { diff --git a/src/joshua/util/NullIterator.java b/src/main/java/org/apache/joshua/util/NullIterator.java similarity index 96% rename from src/joshua/util/NullIterator.java rename to src/main/java/org/apache/joshua/util/NullIterator.java index ca0b8ddd..c6e4b46e 100644 --- a/src/joshua/util/NullIterator.java +++ b/src/main/java/org/apache/joshua/util/NullIterator.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Iterator; import java.util.NoSuchElementException; @@ -25,7 +25,7 @@ /** * This class provides a null-object Iterator. That is, an iterator over an empty collection. * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public class NullIterator<E> implements Iterable<E>, Iterator<E> { diff --git a/src/joshua/util/PackedGrammarServer.java b/src/main/java/org/apache/joshua/util/PackedGrammarServer.java similarity index 88% rename from src/joshua/util/PackedGrammarServer.java rename to src/main/java/org/apache/joshua/util/PackedGrammarServer.java index 3eb6eafc..74c8e4a8 100644 --- a/src/joshua/util/PackedGrammarServer.java +++ b/src/main/java/org/apache/joshua/util/PackedGrammarServer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.util; +package org.apache.joshua.util; import java.io.FileNotFoundException; import java.io.IOException; @@ -24,12 +24,12 @@ import java.util.List; import java.util.Map; -import joshua.corpus.Vocabulary; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.tm.Rule; -import joshua.decoder.ff.tm.Trie; -import joshua.decoder.ff.tm.packed.PackedGrammar; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.tm.Rule; +import org.apache.joshua.decoder.ff.tm.Trie; +import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar; +import org.apache.joshua.util.io.LineReader; public class PackedGrammarServer { diff --git a/src/joshua/util/Pair.java b/src/main/java/org/apache/joshua/util/Pair.java similarity index 99% rename from src/joshua/util/Pair.java rename to src/main/java/org/apache/joshua/util/Pair.java index 08bf08cf..2dd536de 100644 --- a/src/joshua/util/Pair.java +++ b/src/main/java/org/apache/joshua/util/Pair.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; /** * Represents a pair of elements. diff --git a/src/joshua/util/Platform.java b/src/main/java/org/apache/joshua/util/Platform.java similarity index 96% rename from src/joshua/util/Platform.java rename to src/main/java/org/apache/joshua/util/Platform.java index a14ee7e3..22089da5 100644 --- a/src/joshua/util/Platform.java +++ b/src/main/java/org/apache/joshua/util/Platform.java @@ -16,11 +16,10 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; public class Platform { - public static boolean isMac() { return System.getProperties().getProperty("os.name").toLowerCase().indexOf("mac") != -1; } diff --git a/src/joshua/util/QuietFormatter.java b/src/main/java/org/apache/joshua/util/QuietFormatter.java similarity index 97% rename from src/joshua/util/QuietFormatter.java rename to src/main/java/org/apache/joshua/util/QuietFormatter.java index f8340a1c..72200808 100644 --- a/src/joshua/util/QuietFormatter.java +++ b/src/main/java/org/apache/joshua/util/QuietFormatter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.logging.Formatter; import java.util.logging.LogRecord; diff --git a/src/joshua/util/Regex.java b/src/main/java/org/apache/joshua/util/Regex.java similarity index 84% rename from src/joshua/util/Regex.java rename to src/main/java/org/apache/joshua/util/Regex.java index 91df0316..e592c110 100644 --- a/src/joshua/util/Regex.java +++ b/src/main/java/org/apache/joshua/util/Regex.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -27,7 +27,7 @@ * the convenience functions on String. The convenience methods on String are deprecated except for * one-shot patterns (which, by definition, are not in loops). 
* - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-28 07:40:25 -0400 (Sat, 28 Mar 2009) $ */ public class Regex { @@ -88,6 +88,8 @@ public Regex(String regex) throws PatternSyntaxException { /** * Returns whether the input string matches this Regex. + * @param input a String to match against the Regex + * @return true if the input string matches this Regex */ public final boolean matches(String input) { return this.pattern.matcher(input).matches(); } @@ -96,6 +98,8 @@ public final boolean matches(String input) { /** * Split a character sequence, removing instances of this Regex. + * @param input an input string to split + * @return a String array of the substrings between matches of this Regex */ public final String[] split(CharSequence input) { return this.pattern.split(input); } @@ -105,6 +109,9 @@ public final String[] split(CharSequence input) { /** * Split a character sequence, removing instances of this Regex, up to a limited * number of segments. + * @param input an input string to split + * @param limit maximum number of splits + * @return a String array of the substrings between matches of this Regex */ public final String[] split(CharSequence input, int limit) { return this.pattern.split(input, limit); } @@ -114,6 +121,9 @@ public final String[] split(CharSequence input, int limit) { /** * Replace all substrings of the input which match this Regex with the specified * replacement string. + * @param input an input string for which to make replacements + * @param replacement the replacement string + * @return the input with all matches replaced */ public final String replaceAll(String input, String replacement) { return this.pattern.matcher(input).replaceAll(replacement); } @@ -123,6 +133,9 @@ public final String replaceAll(String input, String replacement) { /** * Replace the first substring of the input which matches this Regex with the * specified replacement string. + * @param input the input string for replacement + * @param replacement the replacement string + * @return the input with the first match replaced */ public final String replaceFirst(String input, String replacement) { return this.pattern.matcher(input).replaceFirst(replacement); diff --git a/src/joshua/util/ReverseOrder.java b/src/main/java/org/apache/joshua/util/ReverseOrder.java similarity index 97% rename from src/joshua/util/ReverseOrder.java rename to src/main/java/org/apache/joshua/util/ReverseOrder.java index 32b0c589..0270036c 100644 --- a/src/joshua/util/ReverseOrder.java +++ b/src/main/java/org/apache/joshua/util/ReverseOrder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.Comparator; diff --git a/src/joshua/util/SampledList.java b/src/main/java/org/apache/joshua/util/SampledList.java similarity index 98% rename from src/joshua/util/SampledList.java rename to src/main/java/org/apache/joshua/util/SampledList.java index 0aab3bd3..60b0ef96 100644 --- a/src/joshua/util/SampledList.java +++ b/src/main/java/org/apache/joshua/util/SampledList.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License.
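The design point of the Regex wrapper, per its class comment, is to keep one compiled Pattern around instead of the convenience methods on String, which recompile the expression on every call. A brief usage contrast using plain java.util.regex (not the Joshua class itself):

import java.util.regex.Pattern;

// Usage sketch of the precompiled-pattern idiom that the Regex class wraps.
public class RegexSketch {
  private static final Pattern SPACES = Pattern.compile("\\s+");

  public static void main(String[] args) {
    String[] lines = { "a  b", "c\td" };
    for (String line : lines) {
      // One compiled Pattern reused across iterations, instead of
      // line.split("\\s+") recompiling the expression on every call.
      String[] tokens = SPACES.split(line);
      System.out.println(tokens.length); // 2 for both lines
    }
  }
}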
*/ -package joshua.util; +package org.apache.joshua.util; import java.util.AbstractList; import java.util.List; diff --git a/src/joshua/util/SocketUtility.java b/src/main/java/org/apache/joshua/util/SocketUtility.java similarity index 92% rename from src/joshua/util/SocketUtility.java rename to src/main/java/org/apache/joshua/util/SocketUtility.java index db12a215..e219fd72 100644 --- a/src/joshua/util/SocketUtility.java +++ b/src/main/java/org/apache/joshua/util/SocketUtility.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.BufferedReader; import java.io.DataInputStream; @@ -31,10 +31,9 @@ import java.net.SocketTimeoutException; import java.net.UnknownHostException; - /** * - * @author Zhifei Li, + * @author Zhifei Li, zhifei.work@gmail.com * @version $LastChangedDate$ */ public class SocketUtility { @@ -65,15 +64,8 @@ public static ClientConnection open_connection_client(String hostname, int port) // res.data_out = new DataOutputStream(new BufferedOutputStream // (res.socket.getOutputStream())); - } catch (UnknownHostException e) { - System.out.println("unknown host exception"); - System.exit(1); - } catch (SocketTimeoutException e) { - System.out.println("socket timeout exception"); - System.exit(1); - } catch (IOException e) { - System.out.println("io exception"); - System.exit(1); + } catch (IOException e) { + throw new RuntimeException(e); } return res; } diff --git a/src/joshua/util/StreamGobbler.java b/src/main/java/org/apache/joshua/util/StreamGobbler.java similarity index 97% rename from src/joshua/util/StreamGobbler.java rename to src/main/java/org/apache/joshua/util/StreamGobbler.java index 965d9266..7bb12caf 100644 --- a/src/joshua/util/StreamGobbler.java +++ b/src/main/java/org/apache/joshua/util/StreamGobbler.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.io.BufferedReader; import java.io.IOException; diff --git a/src/joshua/util/UnicodeCharacterName.java b/src/main/java/org/apache/joshua/util/UnicodeCharacterName.java similarity index 99% rename from src/joshua/util/UnicodeCharacterName.java rename to src/main/java/org/apache/joshua/util/UnicodeCharacterName.java index 06b4b888..93c759e5 100644 --- a/src/joshua/util/UnicodeCharacterName.java +++ b/src/main/java/org/apache/joshua/util/UnicodeCharacterName.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util; +package org.apache.joshua.util; import java.util.HashMap; import java.util.Map; diff --git a/src/joshua/util/encoding/Analyzer.java b/src/main/java/org/apache/joshua/util/encoding/Analyzer.java similarity index 98% rename from src/joshua/util/encoding/Analyzer.java rename to src/main/java/org/apache/joshua/util/encoding/Analyzer.java index e85c1339..ad2910c4 100644 --- a/src/joshua/util/encoding/Analyzer.java +++ b/src/main/java/org/apache/joshua/util/encoding/Analyzer.java @@ -16,14 +16,14 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.TreeMap; -import joshua.util.io.LineReader; +import org.apache.joshua.util.io.LineReader; public class Analyzer { diff --git a/src/joshua/util/encoding/EightBitQuantizer.java b/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java similarity index 98% rename from src/joshua/util/encoding/EightBitQuantizer.java rename to src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java index 2a8e0142..5876d4f9 100644 --- a/src/joshua/util/encoding/EightBitQuantizer.java +++ b/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.DataInputStream; import java.io.DataOutputStream; diff --git a/src/joshua/util/encoding/EncoderConfiguration.java b/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java similarity index 93% rename from src/joshua/util/encoding/EncoderConfiguration.java rename to src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java index 6cabf097..28b013fe 100644 --- a/src/joshua/util/encoding/EncoderConfiguration.java +++ b/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.BufferedInputStream; import java.io.DataInputStream; @@ -27,7 +27,7 @@ import java.util.HashMap; import java.util.Map; -import joshua.corpus.Vocabulary; +import org.apache.joshua.corpus.Vocabulary; public class EncoderConfiguration { @@ -130,7 +130,7 @@ public boolean isLabeled() { /** * For now, this just loads a configuration and prints out the number of features. * - * @param args + * @param args an input configuration file */ public static void main(String[] args) { String grammar_dir = null; @@ -152,11 +152,9 @@ } } catch (ArrayIndexOutOfBoundsException e) { - System.err.println("Usage: EncoderConfiguration <grammar_dir>"); - System.exit(1); + throw new RuntimeException("Usage: EncoderConfiguration <grammar_dir>"); } catch (IOException e) { - System.err.println(String.format("* FATAL: can't find file %s/encoding", grammar_dir)); - System.exit(1); + throw new RuntimeException(String.format("* FATAL: can't find file %s/encoding", grammar_dir)); } } } diff --git a/src/joshua/util/encoding/EncoderFactory.java b/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java similarity index 97% rename from src/joshua/util/encoding/EncoderFactory.java rename to src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java index 1cb25e2f..a1f93d07 100644 --- a/src/joshua/util/encoding/EncoderFactory.java +++ b/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; public class EncoderFactory { diff --git a/src/joshua/util/encoding/FeatureTypeAnalyzer.java b/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java similarity index 91% rename from src/joshua/util/encoding/FeatureTypeAnalyzer.java rename to src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java index 4a8861c3..504859fb 100644 --- a/src/joshua/util/encoding/FeatureTypeAnalyzer.java +++ b/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.BufferedOutputStream; import java.io.DataOutputStream; @@ -28,14 +28,15 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.logging.Logger; -import joshua.corpus.Vocabulary; -import joshua.util.io.LineReader; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.util.io.LineReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class FeatureTypeAnalyzer { - private static final Logger logger = Logger.getLogger(FeatureTypeAnalyzer.class.getName()); + private static final Logger LOG = LoggerFactory.getLogger(FeatureTypeAnalyzer.class); private ArrayList<FeatureType> types; @@ -74,8 +75,7 @@ public void readConfig(String config_filename) throws IOException { if ("encoder".equals(fields[0])) { // Adding an encoder to the mix. if (fields.length < 3) { - logger.severe("Incomplete encoder line in config."); - System.exit(0); + throw new RuntimeException("Incomplete encoder line in config."); } String encoder_key = fields[1]; ArrayList<Integer> feature_ids = new ArrayList<Integer>(); @@ -120,11 +120,15 @@ public void observe(int feature_id, float value) { // Inspects the collected histograms, inferring actual type of feature. Then replaces the // analyzer, if present, with the most compact applicable type. public void inferTypes(boolean labeled) { - for (FeatureType ft : types) + for (FeatureType ft : types) { ft.inferUncompressedType(); - for (int id : featureToType.keySet()) - logger.info("Type inferred: " + (labeled ? Vocabulary.word(id) : "Feature " + id) + " is " - + types.get(featureToType.get(id)).encoder.getKey()); + } + if (LOG.isInfoEnabled()) { + for (int id : featureToType.keySet()) { + LOG.info("Type inferred: {} is {}", (labeled ? Vocabulary.word(id) : "Feature " + id), + types.get(featureToType.get(id)).encoder.getKey()); + } + } } public void buildFeatureMap() { @@ -159,7 +163,7 @@ public void write(String file_name) throws IOException { DataOutputStream out_stream = new DataOutputStream(buf_stream); buildFeatureMap(); - + getIdEncoder().writeState(out_stream); out_stream.writeBoolean(labeled); out_stream.writeInt(types.size()); @@ -185,7 +189,7 @@ public String toString() { } return sb.toString(); } - + public boolean isLabeled() { return labeled; } diff --git a/src/joshua/util/encoding/FloatEncoder.java b/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java similarity index 96% rename from src/joshua/util/encoding/FloatEncoder.java rename to src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java index 9841db33..5121ea25 100644 --- a/src/joshua/util/encoding/FloatEncoder.java +++ b/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License.
*/ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.DataInputStream; import java.io.DataOutputStream; diff --git a/src/joshua/util/encoding/IntEncoder.java b/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java similarity index 96% rename from src/joshua/util/encoding/IntEncoder.java rename to src/main/java/org/apache/joshua/util/encoding/IntEncoder.java index 0c79ae88..a8917f7d 100644 --- a/src/joshua/util/encoding/IntEncoder.java +++ b/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.DataInputStream; import java.io.DataOutputStream; diff --git a/src/joshua/util/encoding/PrimitiveFloatEncoder.java b/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java similarity index 98% rename from src/joshua/util/encoding/PrimitiveFloatEncoder.java rename to src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java index f43c29bc..d5015f27 100644 --- a/src/joshua/util/encoding/PrimitiveFloatEncoder.java +++ b/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.DataInputStream; import java.io.DataOutputStream; diff --git a/src/joshua/util/encoding/PrimitiveIntEncoder.java b/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java similarity index 98% rename from src/joshua/util/encoding/PrimitiveIntEncoder.java rename to src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java index 441d5f94..42f60539 100644 --- a/src/joshua/util/encoding/PrimitiveIntEncoder.java +++ b/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; import java.io.DataInputStream; import java.io.DataOutputStream; diff --git a/src/joshua/util/encoding/VariableQuantizer.java b/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java similarity index 98% rename from src/joshua/util/encoding/VariableQuantizer.java rename to src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java index 42f09311..afa3f69e 100644 --- a/src/joshua/util/encoding/VariableQuantizer.java +++ b/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.encoding; +package org.apache.joshua.util.encoding; public class VariableQuantizer { diff --git a/src/joshua/util/io/BinaryIn.java b/src/main/java/org/apache/joshua/util/io/BinaryIn.java similarity index 96% rename from src/joshua/util/io/BinaryIn.java rename to src/main/java/org/apache/joshua/util/io/BinaryIn.java index c6caf4fb..9483e3ed 100644 --- a/src/joshua/util/io/BinaryIn.java +++ b/src/main/java/org/apache/joshua/util/io/BinaryIn.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.DataInput; import java.io.Externalizable; @@ -26,10 +26,7 @@ import java.io.ObjectStreamConstants; import java.io.RandomAccessFile; -public class BinaryIn<E extends Externalizable> extends RandomAccessFile - implements - DataInput, - ObjectInput { +public class BinaryIn<E extends Externalizable> extends RandomAccessFile implements DataInput, ObjectInput { private final Class<E> type; @@ -41,9 +38,7 @@ public BinaryIn(String filename, Class<E> type) throws FileNotFoundException { public int available() throws IOException { long pos = getFilePointer(); long length = length(); - long bytesAvailable = length - pos; - if (bytesAvailable > Integer.MAX_VALUE) { return Integer.MAX_VALUE; } else { @@ -54,13 +49,9 @@ public int available() throws IOException { public E readObject() throws ClassNotFoundException, IOException { int b = peek(); - if (b == ObjectStreamConstants.TC_NULL) { - return null; - } else { - E obj; try { obj = type.newInstance(); @@ -71,8 +62,6 @@ public E readObject() throws ClassNotFoundException, IOException { } catch (IllegalAccessException e) { throw new RuntimeException(e); } - - } } @@ -93,8 +82,6 @@ public long skip(long n) throws IOException { return bytesSkipped; } - - private int peek() throws IOException { long pos = getFilePointer(); int b = read(); diff --git a/src/joshua/util/io/BinaryOut.java b/src/main/java/org/apache/joshua/util/io/BinaryOut.java similarity index 98% rename from src/joshua/util/io/BinaryOut.java rename to src/main/java/org/apache/joshua/util/io/BinaryOut.java index f5b96f24..83830539 100644 --- a/src/joshua/util/io/BinaryOut.java +++ b/src/main/java/org/apache/joshua/util/io/BinaryOut.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.Closeable; import java.io.DataOutput; @@ -31,7 +31,6 @@ import java.io.ObjectStreamConstants; import java.io.OutputStream; import java.io.UTFDataFormatException; -import java.util.logging.Logger; /** * A BinaryOut writes data to an output stream in raw binary form. Each data type is converted to @@ -45,8 +44,6 @@ */ public class BinaryOut implements DataOutput, ObjectOutput, Flushable, Closeable { - @SuppressWarnings("unused") - private static final Logger logger = Logger.getLogger(BinaryOut.class.getName()); public final int BITS_PER_BYTE = 8; @@ -97,8 +94,8 @@ public void close() throws IOException { * If necessary, the current contents of the buffer will be written to the underlying output * stream. * - * @param size - * @throws IOException + * @param size the number of bytes about to be written + * @throws IOException if the buffered bytes cannot be written to the underlying output stream */ protected void prepareBuffer(int size) throws IOException { if (bufferPosition > 0 && bufferPosition >= BUFFER_SIZE - size) { diff --git a/src/joshua/util/io/IndexedReader.java b/src/main/java/org/apache/joshua/util/io/IndexedReader.java similarity index 88% rename from src/joshua/util/io/IndexedReader.java rename to src/main/java/org/apache/joshua/util/io/IndexedReader.java index 07c251e2..f357e557 100644 --- a/src/joshua/util/io/IndexedReader.java +++ b/src/main/java/org/apache/joshua/util/io/IndexedReader.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.IOException; import java.util.Iterator; @@ -26,7 +26,7 @@ /** * Wraps a reader with "line" index information.
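BinaryIn above keeps a private peek() built on RandomAccessFile: record the file pointer, read one byte, and seek back (the seek-back itself is cut off by the hunk boundary above, so treat that step as an assumption), while available() clamps the remaining length into an int. A self-contained sketch of both idioms:

import java.io.IOException;
import java.io.RandomAccessFile;

public class PeekableFile extends RandomAccessFile {

  public PeekableFile(String filename) throws IOException {
    super(filename, "r");
  }

  // Clamp the remaining byte count to what an int can report.
  public int available() throws IOException {
    long remaining = length() - getFilePointer();
    return remaining > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) remaining;
  }

  // Read one byte without consuming it: remember the position, read, seek back.
  public int peek() throws IOException {
    long pos = getFilePointer();
    int b = read();
    seek(pos);
    return b;
  }
}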
* - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public class IndexedReader<E> implements Reader<E> { @@ -46,12 +46,10 @@ public IndexedReader(String elementName, Reader<E> reader) { this.reader = reader; } - - // =============================================================== - // Public (non-interface) methods - // =============================================================== - - /** Return the number of elements delivered so far. */ + /** + * Return the number of elements delivered so far. + * @return integer representing the number of elements delivered so far + */ public int index() { return this.lineNumber; } @@ -59,6 +57,8 @@ public int index() { /** * Wrap an IOException's message with the index when it occurred. + * @param oldError the old {@link java.io.IOException} we wish to wrap + * @return the new wrapped {@link java.io.IOException} */ public IOException wrapIOException(IOException oldError) { IOException newError = @@ -72,7 +72,12 @@ public IOException wrapIOException(IOException oldError) { // Reader // =============================================================== - /** Delegated to the underlying reader. */ + /** + * Delegated to the underlying reader. + * @return true if the reader is ready + * @throws IOException if there is an error determining readiness + */ + @Override public boolean ready() throws IOException { try { return this.reader.ready(); diff --git a/src/joshua/util/io/LineReader.java b/src/main/java/org/apache/joshua/util/io/LineReader.java similarity index 95% rename from src/joshua/util/io/LineReader.java rename to src/main/java/org/apache/joshua/util/io/LineReader.java index a4f9fe04..e61e79a4 100644 --- a/src/joshua/util/io/LineReader.java +++ b/src/main/java/org/apache/joshua/util/io/LineReader.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.BufferedReader; import java.io.FileDescriptor; @@ -31,14 +31,14 @@ import java.util.zip.GZIPInputStream; import java.util.zip.ZipException; -import joshua.decoder.Decoder; +import org.apache.joshua.decoder.Decoder; /** * This class provides an Iterator interface to a BufferedReader. This covers the most common * use-cases for reading from files without ugly code to check whether we got a line or not. * - * @author wren ng thornton - * @author Matt Post + * @author wren ng thornton wren@users.sourceforge.net + * @author Matt Post post@cs.jhu.edu */ public class LineReader implements Reader<String> { @@ -74,6 +74,7 @@ public class LineReader implements Reader<String> { * STDIN. GZIP'd files are tested for automatically. * * @param filename the file to be opened ("-" for STDIN) + * @throws IOException if there is an error reading the input file */ public LineReader(String filename) throws IOException { @@ -110,6 +111,7 @@ public LineReader(String filename, boolean show_progress) throws IOException { /** * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8. + * @param in an {@link java.io.InputStream} to wrap and iterate over line by line */ public LineReader(InputStream in) { this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING)); @@ -130,6 +132,7 @@ public int progress() { * iteration. The method is idempotent, and all calls after the first are no-ops (unless the * thread was interrupted or killed).
For correctness, you must call this method before the * object falls out of scope. + * @throws IOException if there is an error closing the file handler */ public void close() throws IOException { @@ -322,6 +325,7 @@ public void remove() throws UnsupportedOperationException { * remain after calling this method, we implicitly call close. * * @return the number of lines read + * @throws IOException if there is an error reading lines */ public int countLines() throws IOException { int lines = 0; @@ -335,11 +339,10 @@ public int countLines() throws IOException { return lines; } - // =============================================================== - // Main - // =============================================================== - - /** Example usage code. */ + /** + * Example usage code. + * @param args an input file + */ public static void main(String[] args) { if (1 != args.length) { System.out.println("Usage: java LineReader filename"); diff --git a/src/joshua/util/io/NullReader.java b/src/main/java/org/apache/joshua/util/io/NullReader.java similarity index 93% rename from src/joshua/util/io/NullReader.java rename to src/main/java/org/apache/joshua/util/io/NullReader.java index 903557ef..f833f00c 100644 --- a/src/joshua/util/io/NullReader.java +++ b/src/main/java/org/apache/joshua/util/io/NullReader.java @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.IOException; -import joshua.util.NullIterator; +import org.apache.joshua.util.NullIterator; /** @@ -28,7 +28,7 @@ * have a {@link Reader}, and you don't want to check for null all the time. All operations are * no-ops. * - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public class NullReader extends NullIterator implements Reader { diff --git a/src/joshua/util/io/ProgressInputStream.java b/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java similarity index 96% rename from src/joshua/util/io/ProgressInputStream.java rename to src/main/java/org/apache/joshua/util/io/ProgressInputStream.java index 8bdf6c44..075c0b3e 100644 --- a/src/joshua/util/io/ProgressInputStream.java +++ b/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.FilterInputStream; import java.io.IOException; @@ -26,7 +26,7 @@ * Generic progress meter for reading files (compressed or not). Pass it the raw input file stream * and it will keep track for you. * - * @author Matt Post + * @author Matt Post post@cs.jhu.edu */ public class ProgressInputStream extends FilterInputStream { diff --git a/src/joshua/util/io/Reader.java b/src/main/java/org/apache/joshua/util/io/Reader.java similarity index 66% rename from src/joshua/util/io/Reader.java rename to src/main/java/org/apache/joshua/util/io/Reader.java index 021cdd2e..cab6d743 100644 --- a/src/joshua/util/io/Reader.java +++ b/src/main/java/org/apache/joshua/util/io/Reader.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.util.io; +package org.apache.joshua.util.io; import java.io.IOException; import java.util.Iterator; @@ -24,17 +24,28 @@ /** * Common interface for Reader type objects. 
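LineReader above implements Reader<String>, and Reader (whose hunk continues just below) extends both Iterable and Iterator, so a file can be consumed with a plain for-each loop. A usage sketch, assuming only the constructor and iteration behavior visible in this patch; note that the generic parameters are stripped in this plain-text rendering of the diff:

import java.io.IOException;

import org.apache.joshua.util.io.LineReader;

public class LineReaderDemo {
  public static void main(String[] args) throws IOException {
    // "-" would read STDIN; gzip'd files are detected automatically.
    LineReader reader = new LineReader(args[0]);
    try {
      int n = 0;
      for (String line : reader) {
        n++;
      }
      System.out.println("read " + n + " lines");
    } finally {
      reader.close(); // idempotent, per the javadoc above
    }
  }
}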
* - * @author wren ng thornton + * @author wren ng thornton wren@users.sourceforge.net * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public interface Reader<E> extends Iterable<E>, Iterator<E> { - /** Close the reader, freeing all resources. */ + /** + * Close the reader, freeing all resources. + * @throws IOException if there is an error closing the reader instance + */ void close() throws IOException; - /** Determine if the reader is ready to read a line. */ + /** + * Determine if the reader is ready to read a line. + * @return true if it is ready + * @throws IOException if there is an error while determining if the reader is ready + */ boolean ready() throws IOException; - /** Read a "line" and return an object representing it. */ + /** + * Read a "line" and return an object representing it. + * @return an object representing a single line + * @throws IOException if there is an error reading lines + */ E readLine() throws IOException; } diff --git a/src/main/java/org/apache/joshua/util/io/package-info.java b/src/main/java/org/apache/joshua/util/io/package-info.java new file mode 100644 index 00000000..d7ea475c --- /dev/null +++ b/src/main/java/org/apache/joshua/util/io/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides common utility classes for IO. + */ +package org.apache.joshua.util.io; diff --git a/src/main/java/org/apache/joshua/util/package-info.java b/src/main/java/org/apache/joshua/util/package-info.java new file mode 100644 index 00000000..2dedb376 --- /dev/null +++ b/src/main/java/org/apache/joshua/util/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides common utility classes.
+ */ +package org.apache.joshua.util; diff --git a/src/main/java/org/apache/joshua/util/quantization/BooleanQuantizer.java b/src/main/java/org/apache/joshua/util/quantization/BooleanQuantizer.java new file mode 100644 index 00000000..accd933d --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/BooleanQuantizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.util.quantization; + +import java.nio.ByteBuffer; + +/** + * Standard quantizer for boolean types. + * + * @author jg + * + */ +public class BooleanQuantizer extends StatelessQuantizer { + + public final float read(ByteBuffer stream, int position) { + return 1.0f; + } + + public final void write(ByteBuffer stream, float value) {} + + @Override + public String getKey() { + return "boolean"; + } + + public final int size() { + return 0; + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/util/quantization/Quantizer.java b/src/main/java/org/apache/joshua/util/quantization/Quantizer.java new file mode 100644 index 00000000..33a4e9ad --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/Quantizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
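BooleanQuantizer above encodes a feature in zero bytes: write() is a no-op, size() is 0, and read() reconstructs the constant 1.0f, which is all a presence-only boolean feature requires. A round-trip sketch under exactly those semantics:

import java.nio.ByteBuffer;

import org.apache.joshua.util.quantization.BooleanQuantizer;

public class BooleanQuantizerDemo {
  public static void main(String[] args) {
    BooleanQuantizer q = new BooleanQuantizer();
    ByteBuffer buf = ByteBuffer.allocate(16);

    q.write(buf, 1.0f);                                      // no bytes are written
    System.out.println("bytes per value: " + q.size());      // 0
    System.out.println("decoded value: " + q.read(buf, 0));  // always 1.0
  }
}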
+ */ +package org.apache.joshua.util.quantization; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +public interface Quantizer { + + public float read(ByteBuffer stream, int position); + + public void write(ByteBuffer stream, float value); + + public void initialize(); + + public void add(float key); + + public void finalize(); + + public String getKey(); + + public void writeState(DataOutputStream out) throws IOException; + + public void readState(DataInputStream in) throws IOException; + + public int size(); +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java b/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java new file mode 100644 index 00000000..f4765f98 --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.util.quantization; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.joshua.corpus.Vocabulary; + +public class QuantizerConfiguration { + + private static final Quantizer DEFAULT; + + private ArrayList quantizers; + private Map quantizerByFeatureId; + + static { + DEFAULT = new BooleanQuantizer(); + } + + public QuantizerConfiguration() { + quantizers = new ArrayList(); + quantizerByFeatureId = new HashMap(); + } + + public void add(String quantizer_key, List feature_ids) { + Quantizer q = QuantizerFactory.get(quantizer_key); + quantizers.add(q); + int index = quantizers.size() - 1; + for (int feature_id : feature_ids) + quantizerByFeatureId.put(feature_id, index); + } + + public void initialize() { + for (Quantizer q : quantizers) + q.initialize(); + } + + public void finalize() { + for (Quantizer q : quantizers) + q.finalize(); + } + + public final Quantizer get(int feature_id) { + Integer index = quantizerByFeatureId.get(feature_id); + return (index != null ? 
quantizers.get(index) : DEFAULT); } + + public void read(String file_name) throws IOException { + quantizers.clear(); + quantizerByFeatureId.clear(); + + File quantizer_file = new File(file_name); + DataInputStream in_stream = + new DataInputStream(new BufferedInputStream(new FileInputStream(quantizer_file))); + int num_quantizers = in_stream.readInt(); + quantizers.ensureCapacity(num_quantizers); + for (int i = 0; i < num_quantizers; i++) { + String key = in_stream.readUTF(); + Quantizer q = QuantizerFactory.get(key); + q.readState(in_stream); + quantizers.add(q); + } + int num_mappings = in_stream.readInt(); + for (int i = 0; i < num_mappings; i++) { + String feature_name = in_stream.readUTF(); + int feature_id = Vocabulary.id(feature_name); + int quantizer_index = in_stream.readInt(); + if (quantizer_index >= num_quantizers) { + throw new RuntimeException("Error deserializing QuantizerConfig. " + "Feature " + + feature_name + " referring to quantizer " + quantizer_index + " when only " + + num_quantizers + " known."); + } + this.quantizerByFeatureId.put(feature_id, quantizer_index); + } + in_stream.close(); + } + + public void write(String file_name) throws IOException { + File vocab_file = new File(file_name); + DataOutputStream out_stream = + new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file))); + out_stream.writeInt(quantizers.size()); + for (int index = 0; index < quantizers.size(); index++) + quantizers.get(index).writeState(out_stream); + out_stream.writeInt(quantizerByFeatureId.size()); + for (int feature_id : quantizerByFeatureId.keySet()) { + out_stream.writeUTF(Vocabulary.word(feature_id)); + out_stream.writeInt(quantizerByFeatureId.get(feature_id)); + } + out_stream.close(); + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/util/quantization/QuantizerFactory.java b/src/main/java/org/apache/joshua/util/quantization/QuantizerFactory.java new file mode 100644 index 00000000..687b1da0 --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/QuantizerFactory.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
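QuantizerConfiguration's read() and write() above fix a simple binary layout: an int count of quantizers, each quantizer's state (a stateless quantizer stores only its UTF-8 key), then an int count of feature mappings, each a UTF-8 feature name followed by the index of its quantizer. A hedged sketch of a reader for that layout, covering only stateless quantizers:

import java.io.DataInputStream;
import java.io.IOException;

public class QuantizerConfigDump {

  // Mirrors QuantizerConfiguration.read(): counts first, then fixed-shape records.
  public static void dump(DataInputStream in) throws IOException {
    int numQuantizers = in.readInt();
    for (int i = 0; i < numQuantizers; i++) {
      String key = in.readUTF(); // e.g. "boolean"; a stateful type would read more here
      System.out.println("quantizer[" + i + "] = " + key);
    }
    int numMappings = in.readInt();
    for (int i = 0; i < numMappings; i++) {
      String featureName = in.readUTF();
      int quantizerIndex = in.readInt();
      System.out.println(featureName + " -> quantizer " + quantizerIndex);
    }
  }
}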
+ */ +package org.apache.joshua.util.quantization; + + +public class QuantizerFactory { + + public static Quantizer get(String key) { + if ("boolean".equals(key)) { + return new BooleanQuantizer(); + +// } else if ("byte".equals(key)) { +// return new ByteQuantizer(); +// +// } else if ("char".equals(key)) { +// return new CharQuantizer(); +// +// } else if ("short".equals(key)) { +// return new ShortQuantizer(); +// +// } else if ("float".equals(key)) { +// return new FloatQuantizer(); +// +// } else if ("int".equals(key)) { +// return new IntQuantizer(); +// +// } else if ("8bit".equals(key)) { +// return new EightBitQuantizer(); + + } else { + throw new RuntimeException("Unknown quantizer type: " + key); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java b/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java new file mode 100644 index 00000000..e81e9455 --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.util.quantization; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +abstract class StatelessQuantizer implements Quantizer { + + public void initialize() {} + + public void add(float key) {} + + public void finalize() {} + + public void writeState(DataOutputStream out) throws IOException { + out.writeUTF(getKey()); + } + + public void readState(DataInputStream in) throws IOException {} +} \ No newline at end of file diff --git a/src/main/java/org/apache/joshua/util/quantization/package-info.java b/src/main/java/org/apache/joshua/util/quantization/package-info.java new file mode 100644 index 00000000..24185771 --- /dev/null +++ b/src/main/java/org/apache/joshua/util/quantization/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
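QuantizerFactory above dispatches on a string key and fails fast on anything it does not recognize; in this patch only "boolean" is live, with the other branches left commented out. A short usage sketch tying the factory to QuantizerConfiguration; the feature ids are illustrative:

import java.util.Arrays;

import org.apache.joshua.util.quantization.Quantizer;
import org.apache.joshua.util.quantization.QuantizerConfiguration;
import org.apache.joshua.util.quantization.QuantizerFactory;

public class QuantizerSetupDemo {
  public static void main(String[] args) {
    // Direct lookup: any key other than "boolean" currently throws.
    Quantizer q = QuantizerFactory.get("boolean");
    System.out.println(q.getKey());

    // Map hypothetical feature ids 1 and 2 onto the boolean quantizer;
    // unmapped features fall back to the BooleanQuantizer DEFAULT.
    QuantizerConfiguration config = new QuantizerConfiguration();
    config.add("boolean", Arrays.asList(1, 2));
    config.initialize();
  }
}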
+ */ +package org.apache.joshua.util.quantization; \ No newline at end of file diff --git a/src/joshua/zmert/IntermediateOptimizer.java b/src/main/java/org/apache/joshua/zmert/IntermediateOptimizer.java similarity index 98% rename from src/joshua/zmert/IntermediateOptimizer.java rename to src/main/java/org/apache/joshua/zmert/IntermediateOptimizer.java index 68b2463d..333f937e 100644 --- a/src/joshua/zmert/IntermediateOptimizer.java +++ b/src/main/java/org/apache/joshua/zmert/IntermediateOptimizer.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.zmert; +package org.apache.joshua.zmert; import java.io.BufferedReader; import java.io.FileNotFoundException; @@ -31,7 +31,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Semaphore; -import joshua.metrics.EvaluationMetric; +import org.apache.joshua.metrics.EvaluationMetric; public class IntermediateOptimizer implements Runnable { /* non-static data members */ @@ -613,12 +613,8 @@ private void set_suffStats_array(TreeSet[] indicesOfInterest) { inFile.close(); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } // set_suffStats_array(HashMap[] suffStats_array, TreeSet[] indicesOfInterest, Vector[] @@ -961,13 +957,7 @@ private void real_run() { } public void run() { - try { - real_run(); - } catch (Exception e) { - System.err.println("Exception in IntermediateOptimizer.run(): " + e.getMessage()); - e.printStackTrace(); - System.exit(99905); - } + real_run(); if (!strToPrint.equals("")) { threadOutput.add(strToPrint); } diff --git a/src/joshua/zmert/MertCore.java b/src/main/java/org/apache/joshua/zmert/MertCore.java similarity index 93% rename from src/joshua/zmert/MertCore.java rename to src/main/java/org/apache/joshua/zmert/MertCore.java index 0e96347c..c0d470db 100644 --- a/src/joshua/zmert/MertCore.java +++ b/src/main/java/org/apache/joshua/zmert/MertCore.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package joshua.zmert; +package org.apache.joshua.zmert; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -49,10 +49,12 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.metrics.EvaluationMetric; -import joshua.util.StreamGobbler; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.metrics.EvaluationMetric; +import org.apache.joshua.util.StreamGobbler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This code was originally written by Omar Zaidan. In September of 2012, it was augmented to support @@ -62,6 +64,9 @@ */ public class MertCore { + + private static final Logger LOG = LoggerFactory.getLogger(MertCore.class); + private final JoshuaConfiguration joshuaConfiguration; private TreeSet[] indicesOfInterest_all; @@ -293,8 +298,7 @@ private void initialize(int randsToSkip) { if (! new File(refFile).exists()) refFile = refFileName + ".0"; if (! 
new File(refFile).exists()) { - System.err.println(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName)); - System.exit(1); + throw new RuntimeException(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName)); } numSentences = countLines(refFile); @@ -341,12 +345,8 @@ private void initialize(int randsToSkip) { } inFile_names.close(); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } processParamFile(); @@ -369,8 +369,7 @@ private void initialize(int randsToSkip) { if (! new File(refFile).exists()) refFile = refFileName + "." + i; if (! new File(refFile).exists()) { - System.err.println(String.format("* FATAL: can't find reference file '%s'", refFile)); - System.exit(1); + throw new RuntimeException(String.format("* FATAL: can't find reference file '%s'", refFile)); } reference_readers[i] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFile)), "utf8")); @@ -397,12 +396,8 @@ private void initialize(int randsToSkip) { inFile_comm.close(); } } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.initialize(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } @@ -1274,13 +1269,8 @@ public double[] run_single_iteration(int iteration, int minIts, int maxIts, int println("", 1); - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.run_single_iteration(6): " - + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.run_single_iteration(6): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } @@ -1313,9 +1303,7 @@ public double[] run_single_iteration(int iteration, int minIts, int maxIts, int try { blocker.acquire(initsPerIt); } catch (java.lang.InterruptedException e) { - System.err.println("InterruptedException in MertCore.run_single_iteration(): " - + e.getMessage()); - System.exit(99906); + throw new RuntimeException(e); } // extract output from threadOutput[] @@ -1479,21 +1467,15 @@ && fileExists(fakeFileNamePrefix + iteration + fakeFileNameSuffix)) { int decStatus = p.waitFor(); if (decStatus != validDecoderExitValue) { - println("Call to decoder returned " + decStatus + "; was expecting " + throw new RuntimeException("Call to decoder returned " + decStatus + "; was expecting " + validDecoderExitValue + "."); - System.exit(30); } - } catch (IOException e) { - System.err.println("IOException in MertCore.run_decoder(int): " + e.getMessage()); - System.exit(99902); - } catch (InterruptedException e) { - System.err.println("InterruptedException in MertCore.run_decoder(int): " + e.getMessage()); - System.exit(99903); + } catch (IOException | InterruptedException e) { + throw new RuntimeException(e); } retSA[0] = decoderOutFileName; retSA[1] = "1"; - } return retSA; @@ -1598,13 +1580,8 @@ private void produceTempFiles(String nbestFileName, int iteration) { gzipFile(featsFileName); } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.produceTempFiles(int): " - + e.getMessage()); - 
System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.produceTempFiles(int): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1640,9 +1617,7 @@ private void createConfigFile(double[] params, String cfgFileName, String templa inFile.close(); outFile.close(); } catch (IOException e) { - System.err.println("IOException in MertCore.createConfigFile(double[],String,String): " - + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1652,8 +1627,7 @@ private void processParamFile() { try { inFile_init = new Scanner(new FileReader(paramsFileName)); } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.processParamFile(): " + e.getMessage()); - System.exit(99901); + throw new RuntimeException("FileNotFoundException in MertCore.processParamFile(): " + e.getMessage()); } String dummy = ""; @@ -1676,8 +1650,8 @@ private void processParamFile() { } else if (dummy.equals("Fix")) { isOptimizable[c] = false; } else { - println("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)"); - System.exit(21); + throw new RuntimeException("Unknown isOptimizable string " + dummy + + " (must be either Opt or Fix)"); } if (!isOptimizable[c]) { // skip next four values @@ -1691,16 +1665,14 @@ private void processParamFile() { if (dummy.equals("-Inf")) { minThValue[c] = NegInf; } else if (dummy.equals("+Inf")) { - println("minThValue[" + c + "] cannot be +Inf!"); - System.exit(21); + throw new RuntimeException("minThValue[" + c + "] cannot be +Inf!"); } else { minThValue[c] = Double.parseDouble(dummy); } dummy = inFile_init.next(); if (dummy.equals("-Inf")) { - println("maxThValue[" + c + "] cannot be -Inf!"); - System.exit(21); + throw new RuntimeException("maxThValue[" + c + "] cannot be -Inf!"); } else if (dummy.equals("+Inf")) { maxThValue[c] = PosInf; } else { @@ -1710,16 +1682,14 @@ private void processParamFile() { // set minRandValue[c] and maxRandValue[c] (range for random values) dummy = inFile_init.next(); if (dummy.equals("-Inf") || dummy.equals("+Inf")) { - println("minRandValue[" + c + "] cannot be -Inf or +Inf!"); - System.exit(21); + throw new RuntimeException("minRandValue[" + c + "] cannot be -Inf or +Inf!"); } else { minRandValue[c] = Double.parseDouble(dummy); } dummy = inFile_init.next(); if (dummy.equals("-Inf") || dummy.equals("+Inf")) { - println("maxRandValue[" + c + "] cannot be -Inf or +Inf!"); - System.exit(21); + throw new RuntimeException("maxRandValue[" + c + "] cannot be -Inf or +Inf!"); } else { maxRandValue[c] = Double.parseDouble(dummy); } @@ -1727,14 +1697,12 @@ private void processParamFile() { // check for illogical values if (minThValue[c] > maxThValue[c]) { - println("minThValue[" + c + "]=" + minThValue[c] + " > " + maxThValue[c] + "=maxThValue[" - + c + "]!"); - System.exit(21); + throw new RuntimeException("minThValue[" + c + "]=" + minThValue[c] + + " > " + maxThValue[c] + "=maxThValue[" + c + "]!"); } if (minRandValue[c] > maxRandValue[c]) { - println("minRandValue[" + c + "]=" + minRandValue[c] + " > " + maxRandValue[c] - + "=maxRandValue[" + c + "]!"); - System.exit(21); + throw new RuntimeException("minRandValue[" + c + "]=" + minRandValue[c] + + " > " + maxRandValue[c] + "=maxRandValue[" + c + "]!"); } // check for odd values @@ -1804,40 +1772,34 @@ private void processParamFile() { normalizationOptions[2] = c_fromParamName(pName);; if (normalizationOptions[1] <= 0) { - println("Value for the absval normalization 
method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the absval normalization method must be positive."); } if (normalizationOptions[2] == 0) { - println("Unrecognized feature name " + normalizationOptions[2] - + " for absval normalization method.", 1); - System.exit(21); + throw new RuntimeException("Unrecognized feature name " + normalizationOptions[2] + + " for absval normalization method."); } } else if (dummyA[0].equals("maxabsval")) { normalizationOptions[0] = 2; normalizationOptions[1] = Double.parseDouble(dummyA[1]); if (normalizationOptions[1] <= 0) { - println("Value for the maxabsval normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the maxabsval normalization method must be positive."); } } else if (dummyA[0].equals("minabsval")) { normalizationOptions[0] = 3; normalizationOptions[1] = Double.parseDouble(dummyA[1]); if (normalizationOptions[1] <= 0) { - println("Value for the minabsval normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Value for the minabsval normalization method must be positive."); } } else if (dummyA[0].equals("LNorm")) { normalizationOptions[0] = 4; normalizationOptions[1] = Double.parseDouble(dummyA[1]); normalizationOptions[2] = Double.parseDouble(dummyA[2]); if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) { - println("Both values for the LNorm normalization method must be positive."); - System.exit(21); + throw new RuntimeException("Both values for the LNorm normalization method must be positive."); } } else { - println("Unrecognized normalization method " + dummyA[0] + "; " + throw new RuntimeException("Unrecognized normalization method " + dummyA[0] + "; " + "must be one of none, absval, maxabsval, and LNorm."); - System.exit(21); } // if (dummyA[0]) inFile_init.close(); @@ -1947,12 +1909,8 @@ private void processDocInfo() { } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.processDocInfo(): " + e.getMessage()); - System.exit(99901); } catch (IOException e) { - System.err.println("IOException in MertCore.processDocInfo(): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -1987,12 +1945,8 @@ private boolean copyFile(String origFileName, String newFileName) { * inFile.close(); outFile.close(); */ return true; - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException in MertCore.copyFile(String,String): " - + e.getMessage()); - return false; } catch (IOException e) { - System.err.println("IOException in MertCore.copyFile(String,String): " + e.getMessage()); + LOG.error(e.getMessage(), e); return false; } } @@ -2052,8 +2006,7 @@ public void finish() { outFile_lambdas.close(); } catch (IOException e) { - System.err.println("IOException in MertCore.finish(): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -2105,13 +2058,9 @@ private String[] cfgFileToArgsArray(String fileName) { inFile.close(); } catch (FileNotFoundException e) { - println("Z-MERT configuration file " + fileName + " was not found!"); - System.err.println("FileNotFoundException in MertCore.cfgFileToArgsArray(String): " - + e.getMessage()); - System.exit(99901); + throw new RuntimeException("Z-MERT configuration file " + fileName + " was not found!", e); } catch (IOException e) { - System.err.println("IOException in MertCore.cfgFileToArgsArray(String): " + e.getMessage()); - System.exit(99902); + throw new 
RuntimeException(e); } String[] argsArray = new String[argsVector.size()]; @@ -2194,14 +2143,12 @@ private void processArgsArray(String[] args, boolean firstTime) { } else if (option.equals("-rps")) { refsPerSen = Integer.parseInt(args[i + 1]); if (refsPerSen < 1) { - println("refsPerSen must be positive."); - System.exit(10); + throw new RuntimeException("refsPerSen must be positive."); } } else if (option.equals("-txtNrm")) { textNormMethod = Integer.parseInt(args[i + 1]); if (textNormMethod < 0 || textNormMethod > 4) { - println("textNormMethod should be between 0 and 4"); - System.exit(10); + throw new RuntimeException("textNormMethod should be between 0 and 4"); } } else if (option.equals("-p")) { paramsFileName = args[i + 1]; @@ -2221,8 +2168,7 @@ private void processArgsArray(String[] args, boolean firstTime) { } i += optionCount; } else { - println("Unknown metric name " + metricName + "."); - System.exit(10); + throw new RuntimeException("Unknown metric name " + metricName + "."); } } else if (option.equals("-docSet")) { String method = args[i + 1]; @@ -2267,32 +2213,27 @@ private void processArgsArray(String[] args, boolean firstTime) { docSubsetInfo[6] = Integer.parseInt(a2); i += 3; } else { - println("Unknown docSet method " + method + "."); - System.exit(10); + throw new RuntimeException("Unknown docSet method " + method + "."); } } else if (option.equals("-maxIt")) { maxMERTIterations = Integer.parseInt(args[i + 1]); if (maxMERTIterations < 1) { - println("maxMERTIts must be positive."); - System.exit(10); + throw new RuntimeException("maxMERTIts must be positive."); } } else if (option.equals("-minIt")) { minMERTIterations = Integer.parseInt(args[i + 1]); if (minMERTIterations < 1) { - println("minMERTIts must be positive."); - System.exit(10); + throw new RuntimeException("minMERTIts must be positive."); } } else if (option.equals("-prevIt")) { prevMERTIterations = Integer.parseInt(args[i + 1]); if (prevMERTIterations < 0) { - println("prevMERTIts must be non-negative."); - System.exit(10); + throw new RuntimeException("prevMERTIts must be non-negative."); } } else if (option.equals("-stopIt")) { stopMinIts = Integer.parseInt(args[i + 1]); if (stopMinIts < 1) { - println("stopMinIts must be positive."); - System.exit(10); + throw new RuntimeException("stopMinIts must be positive."); } } else if (option.equals("-stopSig")) { stopSigValue = Double.parseDouble(args[i + 1]); @@ -2303,26 +2244,22 @@ private void processArgsArray(String[] args, boolean firstTime) { else if (option.equals("-thrCnt")) { numOptThreads = Integer.parseInt(args[i + 1]); if (numOptThreads < 1) { - println("threadCount must be positive."); - System.exit(10); + throw new RuntimeException("threadCount must be positive."); } } else if (option.equals("-save")) { saveInterFiles = Integer.parseInt(args[i + 1]); if (saveInterFiles < 0 || saveInterFiles > 3) { - println("save should be between 0 and 3"); - System.exit(10); + throw new RuntimeException("save should be between 0 and 3"); } } else if (option.equals("-compress")) { compressFiles = Integer.parseInt(args[i + 1]); if (compressFiles < 0 || compressFiles > 1) { - println("compressFiles should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("compressFiles should be either 0 or 1"); } } else if (option.equals("-ipi")) { initsPerIt = Integer.parseInt(args[i + 1]); if (initsPerIt < 1) { - println("initsPerIt must be positive."); - System.exit(10); + throw new RuntimeException("initsPerIt must be positive."); } } else if 
(option.equals("-opi")) { int opi = Integer.parseInt(args[i + 1]); @@ -2331,8 +2268,7 @@ else if (option.equals("-thrCnt")) { } else if (opi == 0) { oneModificationPerIteration = false; } else { - println("oncePerIt must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("oncePerIt must be either 0 or 1."); } } else if (option.equals("-rand")) { int rand = Integer.parseInt(args[i + 1]); @@ -2341,8 +2277,7 @@ else if (option.equals("-thrCnt")) { } else if (rand == 0) { randInit = false; } else { - println("randInit must be either 0 or 1."); - System.exit(10); + throw new RuntimeException("randInit must be either 0 or 1."); } } else if (option.equals("-seed")) { if (args[i + 1].equals("time")) { @@ -2361,8 +2296,7 @@ else if (option.equals("-cmd")) { } else if (option.equals("-passIt")) { int val = Integer.parseInt(args[i + 1]); if (val < 0 || val > 1) { - println("passIterationToDecoder should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("passIterationToDecoder should be either 0 or 1"); } passIterationToDecoder = (val == 1) ? true : false; } else if (option.equals("-decOut")) { @@ -2374,44 +2308,38 @@ else if (option.equals("-cmd")) { } else if (option.equals("-N")) { sizeOfNBest = Integer.parseInt(args[i + 1]); if (sizeOfNBest < 1) { - println("N must be positive."); - System.exit(10); + throw new RuntimeException("N must be positive."); } } // Output specs else if (option.equals("-v")) { verbosity = Integer.parseInt(args[i + 1]); if (verbosity < 0 || verbosity > 4) { - println("verbosity should be between 0 and 4"); - System.exit(10); + throw new RuntimeException("verbosity should be between 0 and 4"); } } else if (option.equals("-decV")) { decVerbosity = Integer.parseInt(args[i + 1]); if (decVerbosity < 0 || decVerbosity > 1) { - println("decVerbosity should be either 0 or 1"); - System.exit(10); + throw new RuntimeException("decVerbosity should be either 0 or 1"); } } else if (option.equals("-fake")) { fakeFileNameTemplate = args[i + 1]; int QM_i = fakeFileNameTemplate.indexOf("?"); if (QM_i <= 0) { - println("fakeFileNameTemplate must contain '?' to indicate position of iteration number"); - System.exit(10); + throw new RuntimeException("fakeFileNameTemplate must contain '?' 
to indicate position of iteration number"); } fakeFileNamePrefix = fakeFileNameTemplate.substring(0, QM_i); fakeFileNameSuffix = fakeFileNameTemplate.substring(QM_i + 1); } else if (option.equals("-damianos")) { damianos_method = Integer.parseInt(args[i + 1]); if (damianos_method < 0 || damianos_method > 3) { - println("damianos_method should be between 0 and 3"); - System.exit(10); + throw new RuntimeException("damianos_method should be between 0 and 3"); } damianos_param = Double.parseDouble(args[i + 2]); damianos_mult = Double.parseDouble(args[i + 3]); i += 2; } else { - println("Unknown option " + option); - System.exit(10); + throw new RuntimeException("Unknown option " + option); } i += 2; @@ -2483,10 +2411,10 @@ else if (option.equals("-v")) { if (!canRunCommand && !canRunJoshua) { // can only run fake decoder if (!canRunFake) { - println("Z-MERT cannot decode; must provide one of: command file (for external decoder),"); - println(" source file (for Joshua decoder),"); - println(" or prefix for existing output files (for fake decoder)."); - System.exit(12); + String msg = "Z-MERT cannot decode; must provide one of: command file " + + "(for external decoder), source file (for Joshua decoder)," + + " or prefix for existing output files (for fake decoder)."; + throw new RuntimeException(msg); } int lastGoodIt = 0; @@ -2499,9 +2427,8 @@ else if (option.equals("-v")) { } if (lastGoodIt == 0) { - println("Fake decoder cannot find first output file " + throw new RuntimeException("Fake decoder cannot find first output file " + (fakeFileNamePrefix + 1 + fakeFileNameSuffix)); - System.exit(13); } else if (lastGoodIt < maxMERTIterations) { if (firstTime) println("Warning: can only run fake decoder; existing output files " @@ -2583,8 +2510,7 @@ private void set_docSubsetInfo(int[] info) { private void checkFile(String fileName) { if (!fileExists(fileName)) { - println("The file " + fileName + " was not found!"); - System.exit(40); + throw new RuntimeException("The file " + fileName + " was not found!"); } } @@ -2618,8 +2544,7 @@ private void gzipFile(String inputFileName, String gzippedFileName) { deleteFile(inputFileName); } catch (IOException e) { - System.err.println("IOException in MertCore.gzipFile(String,String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } } @@ -2777,8 +2702,7 @@ private int countLines(String fileName) { inFile.close(); } catch (IOException e) { - System.err.println("IOException in MertCore.countLines(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return count; @@ -2798,8 +2722,7 @@ private int countNonEmptyLines(String fileName) { inFile.close(); } catch (IOException e) { - System.err.println("IOException in MertCore.countNonEmptyLines(String): " + e.getMessage()); - System.exit(99902); + throw new RuntimeException(e); } return count; diff --git a/src/joshua/zmert/ZMERT.java b/src/main/java/org/apache/joshua/zmert/ZMERT.java similarity index 98% rename from src/joshua/zmert/ZMERT.java rename to src/main/java/org/apache/joshua/zmert/ZMERT.java index 45f79db8..45f8334f 100644 --- a/src/joshua/zmert/ZMERT.java +++ b/src/main/java/org/apache/joshua/zmert/ZMERT.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License.
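The run_decoder() hunk above collapses two near-identical catch blocks into a single Java 7 multi-catch and turns a println-plus-System.exit on a bad exit status into a thrown exception. A compact sketch of that shape around an external process; the command and expected status are illustrative:

public class ExternalProcessRunner {

  public static void run(String command, int expectedStatus) {
    try {
      Process p = Runtime.getRuntime().exec(command);
      int status = p.waitFor();
      if (status != expectedStatus) {
        throw new RuntimeException("Call to decoder returned " + status
            + "; was expecting " + expectedStatus + ".");
      }
    } catch (java.io.IOException | InterruptedException e) {
      // One handler for both failure modes; callers see a single unchecked type.
      throw new RuntimeException(e);
    }
  }
}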
*/ -package joshua.zmert; +package org.apache.joshua.zmert; import java.io.BufferedReader; import java.io.InputStreamReader; -import joshua.decoder.JoshuaConfiguration; -import joshua.util.FileUtility; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.util.FileUtility; public class ZMERT { public static void main(String[] args) throws Exception { diff --git a/src/main/java/org/apache/joshua/zmert/package-info.java b/src/main/java/org/apache/joshua/zmert/package-info.java new file mode 100644 index 00000000..571b5240 --- /dev/null +++ b/src/main/java/org/apache/joshua/zmert/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Provides code for performing minimum error rate training. + * Much of the code in this package is based on Och (2003). + * A deeper description of the algorithm is in Zaidan (2009). + */ +package org.apache.joshua.zmert; diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties new file mode 100644 index 00000000..67b4d3e0 --- /dev/null +++ b/src/main/resources/log4j.properties @@ -0,0 +1,5 @@ +# log4j settings +log4j.rootLogger=DEBUG, stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout diff --git a/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java b/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java new file mode 100644 index 00000000..de6f32e5 --- /dev/null +++ b/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
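The new test sources in this patch use TestNG rather than JUnit: org.testng.Assert, @Test annotations, dependsOnMethods to order fixtures before dependent tests (see the ArpaFile tests further below), and enabled=false to park a test (as loadVocabFromFile does below). A minimal sketch of those annotations, with hypothetical test names:

import org.testng.Assert;
import org.testng.annotations.Test;

public class OrderingDemoTest {

  @Test
  public void setup() {
    Assert.assertTrue(true); // build fixtures here
  }

  @Test(dependsOnMethods = { "setup" })
  public void usesFixtures() {
    Assert.assertEquals(1 + 1, 2); // runs only after setup() has passed
  }

  @Test(enabled = false)
  public void parked() {
    // deliberately skipped until it can be made to pass
  }
}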
+ */ +package org.apache.joshua.corpus; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Date; +import java.util.logging.Logger; + +//import org.apache.joshua.corpus.CorpusArray; +import org.apache.joshua.corpus.Phrase; +//import org.apache.joshua.corpus.mm.MemoryMappedCorpusArray; +//import org.apache.joshua.corpus.suffix_array.SuffixArrayFactory; +import org.apache.joshua.corpus.Vocabulary; +//import org.apache.joshua.util.FormatUtil; + +import org.testng.Assert; +import org.testng.annotations.Test; + +public class CorpusArrayTest { + + /** Logger for this class. */ + private static Logger logger = + Logger.getLogger(CorpusArrayTest.class.getName()); +} + +// @Test +// public void writePartsToDisk() { +// +// String filename = "data/tiny.en"; +// int numSentences = 5; // Should be 5 sentences in tiny.en +// int numWords = 89; // Should be 89 words in tiny.en +// +// +// try { +// +// // FIX: can't use createVocabulary(String) because we set numWords and numSentences +// Vocabulary vocab = new Vocabulary(); +// SuffixArrayFactory.createVocabulary(filename, vocab); +// Corpus corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences); +// +// corpus.writeWordIDsToFile(filename+".bin"); +// corpus.writeSentenceLengthsToFile(filename+".sbin"); +// +// MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(corpus.getVocabulary(), filename+".bin", numWords*4, filename+".sbin", numSentences*4); +// +// // For each word in the corpus, +// for (int i=0; i, , , -pau-*/ + int numBuiltInSymbols = 9; + + /** , , , -pau- */ + int numBuiltInTerminals = 4; + + @Test + public void basicVocabTest() { + + Vocabulary vocab1 = new Vocabulary(); + Vocabulary vocab2 = new Vocabulary(); + + Assert.assertEquals(vocab1, vocab2); + + Assert.assertFalse(vocab1.size() == 0); + //Assert.assertTrue(vocab1.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING); + //Assert.assertFalse(vocab1.getWords().isEmpty()); + // Assert.assertTrue(vocab1.getWords(0)==Vocabulary.UNKNOWN_WORD_STRING); + // Assert.assertEquals(vocab1.getWords(), vocab1.intToString.values()); + + Assert.assertNotEquals(vocab1.size(), numBuiltInSymbols); + // Assert.assertEquals(vocab1.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING); + + //Assert.assertEquals(vocab1.getID("sample"), Vocabulary.UNKNOWN_WORD); + //Assert.assertEquals(vocab1.getID(null), Vocabulary.UNKNOWN_WORD); + + // Assert.assertFalse(vocab1.terminalToInt.isEmpty()); + // Assert.assertEquals(vocab1.terminalToInt.size(), this.numBuiltInTerminals); + // Assert.assertFalse(vocab1.isFixed); + // + // vocab1.fixVocabulary(); + // Assert.assertTrue(vocab1.isFixed); + + // Assert.assertEquals(vocab1.getID(Vocabulary.X_STRING), -1); + // Assert.assertEquals(vocab1.getID(Vocabulary.X1_STRING), -2); + // Assert.assertEquals(vocab1.getID(Vocabulary.X2_STRING), -3); + // + // Assert.assertEquals(vocab1.getWord(-1), Vocabulary.X_STRING); + // Assert.assertEquals(vocab1.getWord(-2), Vocabulary.X1_STRING); + // Assert.assertEquals(vocab1.getWord(-3), Vocabulary.X2_STRING); + + + + // Assert.assertFalse(vocab2.intToString.isEmpty()); + // Assert.assertTrue(vocab2.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING); + // Assert.assertFalse(vocab2.getWords().isEmpty()); + // Assert.assertTrue(vocab2.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING); + // Assert.assertEquals(vocab2.getWords(), vocab2.intToString.values()); + + Assert.assertNotEquals(vocab2.size(), numBuiltInSymbols); + // 
Assert.assertEquals(vocab2.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING); + + // Assert.assertEquals(vocab2.getID("sample"), Vocabulary.UNKNOWN_WORD); + // Assert.assertEquals(vocab2.getID(null), Vocabulary.UNKNOWN_WORD); + + // Assert.assertFalse(vocab2.terminalToInt.isEmpty()); + // Assert.assertEquals(vocab2.terminalToInt.size(), this.numBuiltInTerminals); + // Assert.assertTrue(vocab2.isFixed); + } + + @Test + public void verifyWordIDs() throws IOException { + + // Adam Lopez's example... + String corpusString = "it makes him and it mars him , it sets him on and it takes him off ."; + // String queryString = "it persuades him and it disheartens him"; + + String sourceFileName; + { + File sourceFile = File.createTempFile("source", new Date().toString()); + PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8"); + sourcePrintStream.println(corpusString); + sourcePrintStream.close(); + sourceFileName = sourceFile.getAbsolutePath(); + } + + Vocabulary vocab = new Vocabulary(); + // Vocabulary.initializeVocabulary(sourceFileName, vocab, true); + +// Assert.assertEquals(vocab.getWords(Vocabulary.id("it")), "it"); +// Assert.assertEquals(vocab.getWord(vocab.getID("makes")), "makes"); +// Assert.assertEquals(vocab.getWord(vocab.getID("him")), "him"); +// Assert.assertEquals(vocab.getWord(vocab.getID("and")), "and"); +// Assert.assertEquals(vocab.getWord(vocab.getID("mars")), "mars"); +// Assert.assertEquals(vocab.getWord(vocab.getID(",")), ","); +// Assert.assertEquals(vocab.getWord(vocab.getID("sets")), "sets"); +// Assert.assertEquals(vocab.getWord(vocab.getID("on")), "on"); +// Assert.assertEquals(vocab.getWord(vocab.getID("takes")), "takes"); +// Assert.assertEquals(vocab.getWord(vocab.getID("off")), "off"); + + // Assert.assertEquals(vocab.getWord(vocab.getID("persuades")), Vocabulary.UNKNOWN_WORD_STRING); + // Assert.assertEquals(vocab.getWord(vocab.getID("disheartens")), Vocabulary.UNKNOWN_WORD_STRING); + } + + @SuppressWarnings("static-access") + @Test(enabled=false) + public void loadVocabFromFile() { + + String filename = "data/tiny.en"; + int numSentences = 5; // Should be 5 sentences in tiny.en + int numWords = 89; // Should be 89 words in tiny.en + int numUniqWords = 60; // Should be 60 unique words in tiny.en + + Vocabulary vocab = new Vocabulary(); + Vocabulary vocab2 = new Vocabulary(); + + Assert.assertTrue(vocab.equals(vocab2)); + Assert.assertTrue(vocab2.equals(vocab)); + Assert.assertEquals(vocab, vocab2); + + try { + vocab.read(new File(getClass().getClassLoader().getResource(filename).getFile())); + //int[] result = Vocabulary.initializeVocabulary(filename, vocab, true); + Assert.assertNotNull(vocab); + Assert.assertEquals(vocab.size(), 2); + //Assert.assertEquals(vocab.getWords(numWords), numWords); + // Assert.assertEquals(result[1], numSentences); + + //Assert.assertTrue(vocab.isFixed); + Assert.assertEquals(Vocabulary.size(), numUniqWords+numBuiltInSymbols); + + } catch (IOException e) { + Assert.fail("Error processing " + filename +"; Reason: " + e); + } + + Assert.assertFalse(vocab.equals(vocab2)); + + try { + vocab2.read(new File(filename)); + //int[] result = Vocabulary.initializeVocabulary(filename, vocab2, true); + Assert.assertNotNull(vocab2); + Assert.assertEquals(vocab2.size(), 2); + // Assert.assertEquals(result[0], numWords); + // Assert.assertEquals(result[1], numSentences); + + // Assert.assertTrue(vocab2.isFixed); + Assert.assertEquals(Vocabulary.size(), numUniqWords+numBuiltInSymbols); + + } catch (IOException e) { + 
Assert.fail("Could not load file " + filename); + } + + Assert.assertEquals(vocab, vocab2); + } +} diff --git a/test/joshua/decoder/ArtificialGrammarAndCorpusCreater.java b/src/test/java/org/apache/joshua/decoder/ArtificialGrammarAndCorpusCreater.java similarity index 83% rename from test/joshua/decoder/ArtificialGrammarAndCorpusCreater.java rename to src/test/java/org/apache/joshua/decoder/ArtificialGrammarAndCorpusCreater.java index 55b97fef..5cc5996a 100644 --- a/test/joshua/decoder/ArtificialGrammarAndCorpusCreater.java +++ b/src/test/java/org/apache/joshua/decoder/ArtificialGrammarAndCorpusCreater.java @@ -1,4 +1,22 @@ -package joshua.decoder; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder; import java.io.BufferedWriter; import java.io.FileWriter; @@ -7,7 +25,7 @@ import java.util.Arrays; import java.util.List; -import joshua.util.FileUtility; +import org.apache.joshua.util.FileUtility; public class ArtificialGrammarAndCorpusCreater { @@ -95,9 +113,9 @@ protected final void writeMainGrammar(boolean includeInvertingNonterminalRule) { ruleList = getArtificalGrammarsList2(); } else{ - ruleList = getArtificalGrammarsList1(); + ruleList = getArtificalGrammarsList1(); } - + writeFile(mainGrammarFilePath,ruleList); } diff --git a/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java b/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java new file mode 100644 index 00000000..ed49c2a4 --- /dev/null +++ b/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Date; +import java.util.Scanner; + +import org.apache.joshua.corpus.Corpus; +import org.apache.joshua.corpus.Vocabulary; + +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Unit tests for decoder thread. 
+ * + * @author Lane Schwartz + * @version $LastChangedDate$ + */ +public class DecoderThreadTest { + + @Test + public void setup() { + + String[] sourceSentences = { + "a b c d", + "a b c d", + "a b c d" + }; + + String[] targetSentences = { + "w x y z", + "w t u v", + "s x y z" + }; + + String[] alignmentLines = { + "0-0 1-1 2-2 3-3", + "0-0 1-1 2-2 3-3", + "0-0 1-1 2-2 3-3" + }; + + String[] testSentences = { + "a b c" + }; + + try { + + // Set up source corpus + File sourceFile = File.createTempFile("source", new Date().toString()); + PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8"); + for (String sentence : sourceSentences) { + sourcePrintStream.println(sentence); + } + sourcePrintStream.close(); + String sourceCorpusFileName = sourceFile.getAbsolutePath(); + +// Vocabulary vocabulary = new Vocabulary(); +// int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, vocabulary, true); +// Assert.assertEquals(sourceLengths.length, 2); +// int numberOfSentences = sourceLengths[1]; +// +// Corpus sourceCorpus = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, vocabulary, sourceLengths[0], sourceLengths[1]); + + + // Set up target corpus + File targetFile = File.createTempFile("target", new Date().toString()); + PrintStream targetPrintStream = new PrintStream(targetFile, "UTF-8"); + for (String sentence : targetSentences) { + targetPrintStream.println(sentence); + } + targetPrintStream.close(); + String targetCorpusFileName = targetFile.getAbsolutePath(); + +// int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, vocabulary, true); +// Assert.assertEquals(targetLengths.length, sourceLengths.length); +// for (int i=0, n=targetLengths.length; i Map<Integer, Integer> counts = new HashMap<Integer, Integer>(); + + boolean iterationOccurred = false; + + for (ArpaNgram ngram : arpaFile) { + + iterationOccurred = true; + + int order = ngram.order(); + // System.err.println("Order = " + order); + + int count; + if (counts.containsKey(order)) { + count = counts.get(order) + 1; + } else { + count = 1; + } + + counts.put(order, count); + + } + + Assert.assertTrue(iterationOccurred); + + Assert.assertTrue(counts.containsKey(1)); + Assert.assertTrue(counts.containsKey(2)); + Assert.assertTrue(counts.containsKey(3)); + + Assert.assertEquals((int) counts.get(1), 8); + Assert.assertEquals((int) counts.get(2), 5); + Assert.assertEquals((int) counts.get(3), 1); + + } + + @Test(dependsOnMethods = { "setup" }) + public void testSize() { + ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); + + Assert.assertEquals(arpaFile.size(), 14); + } + + @Test(dependsOnMethods = { "setup", "testIteration" }) + public void testChildren() throws FileNotFoundException { + ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); + + TrieLM lm = new TrieLM(arpaFile); + // System.err.println(lm.getChildren().size()); + Assert.assertNotSame(lm.getChildren().size(), 0); + } + + @Test(dependsOnMethods = { "setup", "testIteration", "testChildren" }) + public void testTrie() throws FileNotFoundException { + ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); + + TrieLM lm = new TrieLM(arpaFile); + + testLm(lm); + + } + + @Test(dependsOnMethods = { "setup", "testIteration", "testChildren" }) + public void testBerkeley() throws FileNotFoundException { + + LMGrammarBerkeley lm = new LMGrammarBerkeley(3, arpaFileName); + + testLm(lm); + + } + + /** + * @param lm + */ + private void testLm(NGramLanguageModel lm) { + // Test unigrams known to be in the language model +// 
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a")), -1.992672, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because")), -2.713723, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("boycott")), -4.678545, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of")), -1.609573, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("parliament")), -3.875917, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato")), -9.753210, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption")), -4.678545, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the")), -1.712444, 0.000001f); + + // Test unigrams known to NOT be in the language model +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("banana")), -JoshuaConfiguration.lm_ceiling_cost, 0.000001f); + + // Test bigrams known to be in the language model +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of")), -0.3552987, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the")), -0.7507797, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption of")), -0.7266324, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the resumption")), -3.936147, 0.000001f); + + // Test trigrams known to be in the language model +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of the")), -0.6309999f, 0.000001f); + + // Test bigrams known to NOT be in the language model (but the unigrams are) +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a boycott")), -4.678545f + -0.1195484f, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of parliament")), -3.875917f + -0.1991907f, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the potato")), -9.753210f + -0.1606644f, 0.000001f); +// Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato parliament")), -3.875917f + -0.0f, 0.000001f); + + // Test trigrams known to NOT be in the language model (but the bigrams are) +// int[] words = vocab.getIDs("because of a"); +// double f = lm.ngramLogProbability(words); +// Assert.assertEquals(f, -1.403534f + -0.03083654f, 0.000001f); + // //Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the parliament")), -3.875917f + -0.05237135f, 0.000001f); + } +} diff --git a/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java b/src/test/java/org/apache/joshua/decoder/ff/lm/LanguageModelFFTest.java similarity index 88% rename from tst/joshua/decoder/ff/lm/LanguageModelFFTest.java rename to src/test/java/org/apache/joshua/decoder/ff/lm/LanguageModelFFTest.java index 83f5397c..f762e316 100644 --- a/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java +++ b/src/test/java/org/apache/joshua/decoder/ff/lm/LanguageModelFFTest.java @@ -16,19 +16,20 @@ * specific language governing permissions and limitations * under the License. 
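The commented-out assertions in testLm above encode the standard ARPA backoff arithmetic: a bigram absent from the model but whose unigram exists scores as the unigram log-probability plus the backoff weight of the preceding word (for example, "a boycott" scores -4.678545 + -0.1195484). A minimal, self-contained sketch of that lookup follows; the two maps are illustrative stand-ins, not Joshua's NGramLanguageModel API.

import java.util.HashMap;
import java.util.Map;

public class ArpaBackoffSketch {

  // Stand-in tables keyed by the n-gram string; an ARPA file supplies both.
  static final Map<String, Double> logProb = new HashMap<String, Double>();
  static final Map<String, Double> backoff = new HashMap<String, Double>();

  /** log P(w2 | w1): use the stored bigram if present, else unigram plus backoff(w1). */
  static double bigramLogProb(String w1, String w2) {
    Double direct = logProb.get(w1 + " " + w2);
    if (direct != null)
      return direct;
    // Backoff: the unigram's probability plus the backoff weight of the context word.
    return logProb.getOrDefault(w2, -100.0) + backoff.getOrDefault(w1, 0.0);
  }

  public static void main(String[] args) {
    logProb.put("a", -1.992672);
    logProb.put("boycott", -4.678545);
    backoff.put("a", -0.1195484);
    // No "a boycott" entry, so this backs off: -4.678545 + -0.1195484
    System.out.println(bigramLogProb("a", "boycott"));
  }
}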
*/ - package joshua.decoder.ff.lm; +package org.apache.joshua.decoder.ff.lm; import static org.junit.Assert.*; +import static org.hamcrest.CoreMatchers.*; import org.junit.After; import org.junit.Before; import org.junit.Test; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.ff.state_maintenance.NgramDPState; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; public class LanguageModelFFTest { @@ -81,7 +82,7 @@ public void givenOnlyStartSymbol_whenEstimateFutureCost_thenZeroResult() { @Test public void givenStartAndOneMoreSymbol_whenEstimateFutureCost_thenMultipleWeightAndLogProbabilty() { int startSymbolId = Vocabulary.id(Vocabulary.START_SYM); - assertNotEquals(startSymbolId, 3); + assertThat(startSymbolId, not(equalTo(3))); int[] left = {startSymbolId, 3}; NgramDPState currentState = new NgramDPState(left, new int[left.length]); diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java similarity index 50% rename from tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java rename to src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java index 74a832e7..bcc10399 100644 --- a/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java +++ b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java @@ -1,4 +1,22 @@ -package joshua.decoder.ff.lm.berkeley_lm; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm.berkeley_lm; import static org.junit.Assert.assertEquals; diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java similarity index 57% rename from tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java rename to src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java index 6e0d90f8..df73136e 100644 --- a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java +++ b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java @@ -1,4 +1,22 @@ -package joshua.decoder.ff.lm.berkeley_lm; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.ff.lm.berkeley_lm; import static org.junit.Assert.assertEquals; @@ -9,18 +27,17 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameter; import org.junit.runners.Parameterized.Parameters; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; /** * Replacement for test/lm/berkeley/test.sh regression test */ -@RunWith(Parameterized.class) +@RunWith(value = Parameterized.class) public class LMGrammarBerkeleyTest { private static final String INPUT = "the chat-rooms"; @@ -42,7 +59,7 @@ public void tearDown() throws Exception { decoder.cleanUp(); } - @Parameter + //TODO @Parameters public String lmFile; @Test diff --git a/test/joshua/decoder/io/DeNormalizeTest.java b/src/test/java/org/apache/joshua/decoder/io/DeNormalizeTest.java similarity index 90% rename from test/joshua/decoder/io/DeNormalizeTest.java rename to src/test/java/org/apache/joshua/decoder/io/DeNormalizeTest.java index 9f3a404a..88b23502 100644 --- a/test/joshua/decoder/io/DeNormalizeTest.java +++ b/src/test/java/org/apache/joshua/decoder/io/DeNormalizeTest.java @@ -1,4 +1,22 @@ -package joshua.decoder.io; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
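Note that the hunk above replaces the @Parameter annotation on lmFile with a TODO comment, which leaves the field without value injection; under JUnit 4's Parameterized runner a parameter must either keep @Parameter on a public field or arrive through the constructor. A generic sketch of the constructor-injection style (plain JUnit 4; the parameter values here are hypothetical placeholders):

import java.util.Arrays;
import java.util.Collection;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

@RunWith(Parameterized.class)
public class ParameterizedSketchTest {

  @Parameters
  public static Collection<Object[]> lmFiles() {
    // Hypothetical language-model paths; substitute the real test resources.
    return Arrays.asList(new Object[][] { { "lm" }, { "lm.gz" } });
  }

  private final String lmFile;

  // The runner passes each row of lmFiles() to this constructor.
  public ParameterizedSketchTest(String lmFile) {
    this.lmFile = lmFile;
  }

  @Test
  public void loads() {
    org.junit.Assert.assertNotNull(lmFile);
  }
}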
+ */ +package org.apache.joshua.decoder.io; import static org.testng.Assert.assertEquals; diff --git a/test/joshua/decoder/io/TranslationRequestTest.java b/src/test/java/org/apache/joshua/decoder/io/TranslationRequestTest.java similarity index 62% rename from test/joshua/decoder/io/TranslationRequestTest.java rename to src/test/java/org/apache/joshua/decoder/io/TranslationRequestTest.java index 5a3aacdc..5a1c3ab7 100644 --- a/test/joshua/decoder/io/TranslationRequestTest.java +++ b/src/test/java/org/apache/joshua/decoder/io/TranslationRequestTest.java @@ -1,9 +1,30 @@ -package joshua.decoder.io; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.io; +import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; -import joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration; import org.testng.annotations.*; import static org.testng.Assert.*; @@ -55,7 +76,8 @@ public void testTranslationRequest() { @Test(enabled = true) public void testSize_uponConstruction() { InputStream in = mock(InputStream.class); - TranslationRequest request = new TranslationRequest(in, joshuaConfiguration); + TranslationRequestStream request = new TranslationRequestStream( + new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())), joshuaConfiguration); assertEquals(request.size(), 0); } @@ -67,7 +89,8 @@ public void testSize_uponConstruction() { public void testSize_1() throws Exception { byte[] data = "1".getBytes(); ByteArrayInputStream input = new ByteArrayInputStream(data); - TranslationRequest request = new TranslationRequest(input, joshuaConfiguration); + TranslationRequestStream request = new TranslationRequestStream( + new BufferedReader(new InputStreamReader(input, Charset.defaultCharset())), joshuaConfiguration); request.next(); assertEquals(request.size(), 1); } @@ -80,7 +103,8 @@ public void testSize_1() throws Exception { public void testSize_newline() throws Exception { byte[] data = "\n".getBytes(); ByteArrayInputStream input = new ByteArrayInputStream(data); - TranslationRequest request = new TranslationRequest(input, joshuaConfiguration); + TranslationRequestStream request = new TranslationRequestStream( + new BufferedReader(new InputStreamReader(input, Charset.defaultCharset())), joshuaConfiguration); request.next(); assertEquals(request.size(), 1); } @@ -93,7 +117,8 @@ public void testSize_newline() throws Exception { public void testSize_2newlines() throws Exception { byte[] data = "\n\n".getBytes(); ByteArrayInputStream input = new ByteArrayInputStream(data); - TranslationRequest request = new TranslationRequest(input, joshuaConfiguration); + 
TranslationRequestStream request = new TranslationRequestStream( + new BufferedReader(new InputStreamReader(input, Charset.defaultCharset())), joshuaConfiguration); request.next(); request.next(); assertEquals(request.size(), 2); @@ -107,7 +132,8 @@ public void testSize_2newlines() throws Exception { public void testNext_2Newlines() throws Exception { byte[] data = "\n\n".getBytes(); ByteArrayInputStream input = new ByteArrayInputStream(data); - TranslationRequest request = new TranslationRequest(input, joshuaConfiguration); + TranslationRequestStream request = new TranslationRequestStream( + new BufferedReader(new InputStreamReader(input, Charset.defaultCharset())), joshuaConfiguration); assertEquals(request.next().source(), ""); assertEquals(request.next().source(), ""); } diff --git a/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java b/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java similarity index 86% rename from tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java rename to src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java index 26c503a1..c2cb0316 100644 --- a/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java +++ b/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package joshua.decoder.kbest_extraction; + package org.apache.joshua.decoder.kbest_extraction; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; @@ -34,7 +34,7 @@ import static com.google.common.base.Charsets.UTF_8; import static java.nio.file.Files.readAllBytes; -import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; import static org.junit.Assert.assertEquals; /** diff --git a/src/joshua/decoder/phrase/CoverageTest.java b/src/test/java/org/apache/joshua/decoder/phrase/CoverageTest.java similarity index 98% rename from src/joshua/decoder/phrase/CoverageTest.java rename to src/test/java/org/apache/joshua/decoder/phrase/CoverageTest.java index 90bcbafe..7526b1f2 100644 --- a/src/joshua/decoder/phrase/CoverageTest.java +++ b/src/test/java/org/apache/joshua/decoder/phrase/CoverageTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package joshua.decoder.phrase; +package org.apache.joshua.decoder.phrase; import static org.junit.Assert.*; diff --git a/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java b/src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java similarity index 86% rename from tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java rename to src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java index 6abfbe2f..4612b446 100644 --- a/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java +++ b/src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package joshua.decoder.phrase.constrained; + package org.apache.joshua.decoder.phrase.constrained; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; @@ -34,7 +34,7 @@ import static com.google.common.base.Charsets.UTF_8; import static java.nio.file.Files.readAllBytes; -import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; import static org.junit.Assert.assertEquals; /** diff --git a/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java b/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java similarity index 86% rename from tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java rename to src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java index 4785aff5..12891ee1 100644 --- a/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java +++ b/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package joshua.decoder.phrase.decode; + package org.apache.joshua.decoder.phrase.decode; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; @@ -34,7 +34,7 @@ import static com.google.common.base.Charsets.UTF_8; import static java.nio.file.Files.readAllBytes; -import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; import static org.junit.Assert.assertEquals; /** diff --git a/test/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java b/src/test/java/org/apache/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java similarity index 69% rename from test/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java rename to src/test/java/org/apache/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java index 3ecb31e0..3b2852c8 100644 --- a/test/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java +++ b/src/test/java/org/apache/joshua/decoder/segment_file/AlmostTooLongSentenceTest.java @@ -1,4 +1,22 @@ -package joshua.decoder.segment_file; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.segment_file; import org.testng.annotations.Test; @@ -6,7 +24,7 @@ import org.testng.annotations.AfterMethod; import static org.testng.Assert.*; -import joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration; public class AlmostTooLongSentenceTest { private JoshuaConfiguration joshuaConfiguration; diff --git a/test/joshua/decoder/segment_file/SentenceTest.java b/src/test/java/org/apache/joshua/decoder/segment_file/SentenceTest.java similarity index 62% rename from test/joshua/decoder/segment_file/SentenceTest.java rename to src/test/java/org/apache/joshua/decoder/segment_file/SentenceTest.java index cdacc3ea..8e0d171f 100644 --- a/test/joshua/decoder/segment_file/SentenceTest.java +++ b/src/test/java/org/apache/joshua/decoder/segment_file/SentenceTest.java @@ -1,6 +1,24 @@ -package joshua.decoder.segment_file; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.decoder.segment_file; -import joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.JoshuaConfiguration; import org.testng.annotations.Test; import org.testng.annotations.BeforeMethod; @@ -56,11 +74,12 @@ private String concatTokens(String repeatedToken, int repeatedTimes) { } /** - * The too long input sentence should be replaced with an empty string. + * The too long input sentence should be truncated from 799 to 202 characters + * TODO is this a bug? maxlen is defined as 200 not 202 characters */ @Test - public void testTooManyTokensSourceOnlyEmpty() { - assertTrue(new Sentence(this.tooLongInput, 0, joshuaConfiguration).isEmpty()); + public void testTooManyTokensSourceTruncated() { + assertTrue(new Sentence(this.tooLongInput, 0, joshuaConfiguration).length() == 202); } @Test @@ -75,9 +94,9 @@ public void testTooManyTokensSourceAndTargetIsEmpty() { } @Test - public void testTooManyTokensSourceAndTargetEmptyString() { + public void testTooManyTokensSourceAndTargetTruncated() { Sentence sentence = new Sentence(this.tooLongInput + " ||| target side", 0, joshuaConfiguration); - assertTrue(sentence.isEmpty()); + assertTrue(sentence.length() == 202); } @Test diff --git a/src/test/java/org/apache/joshua/lattice/ArcTest.java b/src/test/java/org/apache/joshua/lattice/ArcTest.java new file mode 100644 index 00000000..a26a5934 --- /dev/null +++ b/src/test/java/org/apache/joshua/lattice/ArcTest.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.lattice; + +import org.apache.joshua.lattice.Arc; + +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Unit tests for Arc class. 
+ * + * @author Lane Schwartz + * @since 2008-07-09 + * @version $LastChangedDate$ + */ +@Test(groups = { "lattice_arc" }) +public class ArcTest { + + private final Node head = new Node(1); + private final Node tail = new Node(2); + private final float cost = (float) Math.PI; + private final String label = "pi"; + + private Arc arc; + + @Test(dependsOnMethods = { "org.apache.joshua.lattice.NodeTest.constructNode" }) + //@Test(dependsOnGroups = {"lattice_node" }) + public void constructArc() { + + arc = new Arc(tail, head, (float)cost, label); + + Assert.assertEquals(arc.getHead(), head); + Assert.assertEquals(arc.getTail(), tail); + Assert.assertEquals(arc.getCost(), cost); + Assert.assertEquals(arc.getLabel(), label); + + } + + @Test(dependsOnMethods = { "constructArc" }) + public void getHead() { + + Assert.assertEquals(arc.getHead(), head); + + } + + + @Test(dependsOnMethods = { "constructArc" }) + public void getTail() { + + Assert.assertEquals(arc.getTail(), tail); + + } + + + @Test(dependsOnMethods = { "constructArc" }) + public void getCost() { + + Assert.assertEquals(arc.getCost(), cost); + + } + + + @Test(dependsOnMethods = { "constructArc" }) + public void getLabel() { + + Assert.assertEquals(arc.getLabel(), label); + + } +} diff --git a/src/test/java/org/apache/joshua/lattice/LatticeTest.java b/src/test/java/org/apache/joshua/lattice/LatticeTest.java new file mode 100644 index 00000000..1522120e --- /dev/null +++ b/src/test/java/org/apache/joshua/lattice/LatticeTest.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.lattice; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Unit tests for Lattice class. 
+ * + * @author Lane Schwartz + * @since 2008-07-09 + * @version $LastChangedDate$ + */ +@Test(groups = { "lattice" }) +public class LatticeTest { + + @Test + public void allPairsShortestPath() { + + List<Node<String>> nodes = new ArrayList<Node<String>>(); + for (int i=0; i<4; i++) { + nodes.add(new Node<String>(i)); + } + + nodes.get(0).addArc(nodes.get(1), (float) 1.0, "x"); + nodes.get(1).addArc(nodes.get(2), (float) 1.0, "y"); + nodes.get(0).addArc(nodes.get(2), (float) 1.5, "a"); + nodes.get(2).addArc(nodes.get(3), (float) 3.0, "b"); + nodes.get(2).addArc(nodes.get(3), (float) 5.0, "c"); + + Lattice<String> graph = new Lattice<String>(nodes, new JoshuaConfiguration()); + + Assert.assertEquals(graph.getShortestPath(0, 1), 1); + Assert.assertEquals(graph.getShortestPath(0, 2), 1); + Assert.assertEquals(graph.getShortestPath(1, 2), 1); + Assert.assertEquals(graph.getShortestPath(0, 3), 2); + Assert.assertEquals(graph.getShortestPath(1, 3), 2); + Assert.assertEquals(graph.getShortestPath(2, 3), 1); + } + + @Test + public void createFromString() { + + String data = + + // Start of lattice + "("+ + + // Node 0 + "("+ + "('A',1.0,5),"+ // Arc with label A and cost 1.0. Destination is Node 5 (Node 0 + span of 5) + "('B',1.0,2),"+ // Arc with label B and cost 1.0. Destination is Node 2 (Node 0 + span of 2) + "('C',1.0,3),"+ // Arc with label C and cost 1.0. Destination is Node 3 (Node 0 + span of 3) + "('D',1.0,1),"+ // Arc with label D and cost 1.0. Destination is Node 1 (Node 0 + span of 1) + ")," + + + // Node 1 + "(" + + "('E',1.0,4)," + // Arc with label E and cost 1.0. Destination is Node 5 (Node 1 + span of 4) + ")," + + + // Node 2 + "(" + + "('C',1.0,3)," + // Arc with label C and cost 1.0. Destination is Node 5 (Node 2 + span of 3) + ")," + + + // Node 3 + "(" + + "('D',1.0,1)," + // Arc with label D and cost 1.0. Destination is Node 4 (Node 3 + span of 1) + ")," + + + // Node 4 + "(" + + "('E',1.0,1)," + // Arc with label E and cost 1.0. Destination is Node 5 (Node 4 + span of 1) + ")," + + + // Node 5 + "(" + + "('X',1.0,1)," + // Arc with label X and cost 1.0. Destination is Node 6 (Node 5 + span of 1) + ")," + + + // There is an implicit final state (Node 6). 
+ + ")"; // End of lattice + + + Lattice lattice = Lattice.createFromString(data); + + int numberOfNodes = 7; + + Assert.assertEquals(lattice.size(), numberOfNodes); + + Node node0 = lattice.getNode(0); + Node node1 = lattice.getNode(1); + Node node2 = lattice.getNode(2); + Node node3 = lattice.getNode(3); + Node node4 = lattice.getNode(4); + Node node5 = lattice.getNode(5); + Node node6 = lattice.getNode(6); + + Assert.assertEquals(node0.size(), 4); + Assert.assertEquals(node1.size(), 1); + Assert.assertEquals(node2.size(), 1); + Assert.assertEquals(node3.size(), 1); + Assert.assertEquals(node4.size(), 1); + Assert.assertEquals(node5.size(), 1); + Assert.assertEquals(node6.size(), 0); + + // Node 0 outgoing arcs + + Arc arcA_0_5 = node0.getOutgoingArcs().get(0); + Assert.assertEquals(arcA_0_5.getLabel(), "A"); + Assert.assertEquals(arcA_0_5.getHead(), node5); + Assert.assertEquals(arcA_0_5.getTail(), node0); + + Assert.assertEquals(arcA_0_5.getCost(), (float) 1.0); + + Arc arcB_0_2 = node0.getOutgoingArcs().get(1); + Assert.assertEquals(arcB_0_2.getLabel(), "B"); + Assert.assertEquals(arcB_0_2.getHead(), node2); + Assert.assertEquals(arcB_0_2.getTail(), node0); + Assert.assertEquals(arcB_0_2.getCost(), (float) 1.0); + + Arc arcC_0_3 = node0.getOutgoingArcs().get(2); + Assert.assertEquals(arcC_0_3.getLabel(), "C"); + Assert.assertEquals(arcC_0_3.getHead(), node3); + Assert.assertEquals(arcC_0_3.getTail(), node0); + Assert.assertEquals(arcC_0_3.getCost(), (float) 1.0); + + Arc arcD_0_1 = node0.getOutgoingArcs().get(3); + Assert.assertEquals(arcD_0_1.getLabel(), "D"); + Assert.assertEquals(arcD_0_1.getHead(), node1); + Assert.assertEquals(arcD_0_1.getTail(), node0); + Assert.assertEquals(arcD_0_1.getCost(), (float) 1.0); + + // Node 1 outgoing arcs + Arc arcE_1_5 = node1.getOutgoingArcs().get(0); + Assert.assertEquals(arcE_1_5.getLabel(), "E"); + Assert.assertEquals(arcE_1_5.getHead(), node5); + Assert.assertEquals(arcE_1_5.getTail(), node1); + Assert.assertEquals(arcE_1_5.getCost(), (float) 1.0); + + // Node 2 outgoing arcs + Arc arcC_2_5 = node2.getOutgoingArcs().get(0); + Assert.assertEquals(arcC_2_5.getLabel(), "C"); + Assert.assertEquals(arcC_2_5.getHead(), node5); + Assert.assertEquals(arcC_2_5.getTail(), node2); + Assert.assertEquals(arcC_2_5.getCost(), (float) 1.0); + + // Node 3 outgoing arcs + Arc arcD_3_4 = node3.getOutgoingArcs().get(0); + Assert.assertEquals(arcD_3_4.getLabel(), "D"); + Assert.assertEquals(arcD_3_4.getHead(), node4); + Assert.assertEquals(arcD_3_4.getTail(), node3); + Assert.assertEquals(arcD_3_4.getCost(), (float) 1.0); + + // Node 4 outgoing arcs + Arc arcE_4_5 = node4.getOutgoingArcs().get(0); + Assert.assertEquals(arcE_4_5.getLabel(), "E"); + Assert.assertEquals(arcE_4_5.getHead(), node5); + Assert.assertEquals(arcE_4_5.getTail(), node4); + Assert.assertEquals(arcE_1_5.getCost(), (float) 1.0); + + // Node 5 outgoing arcs + Arc arcX_5_6 = node5.getOutgoingArcs().get(0); + Assert.assertEquals(arcX_5_6.getLabel(), "X"); + Assert.assertEquals(arcX_5_6.getHead(), node6); + Assert.assertEquals(arcX_5_6.getTail(), node5); + Assert.assertEquals(arcX_5_6.getCost(), (float) 1.0); + } +} diff --git a/src/test/java/org/apache/joshua/lattice/NodeTest.java b/src/test/java/org/apache/joshua/lattice/NodeTest.java new file mode 100644 index 00000000..b58ba1e6 --- /dev/null +++ b/src/test/java/org/apache/joshua/lattice/NodeTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.lattice; + +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Unit tests for Node class. + * + * @author Lane Schwartz + * @since 2008-07-09 + * @version $LastChangedDate$ + */ +@Test(groups = { "lattice_node" }) +public class NodeTest { + + private final int id = 12345; + + private Node node; + + @Test + public void constructNode() { + node = new Node(id); + Assert.assertEquals((int) node.id(), (int) id); + Assert.assertTrue(node.getOutgoingArcs().isEmpty()); + Assert.assertEquals(node.size(), 0); + } + + + @Test(dependsOnMethods = { "constructNode" }) + public void getNumber() { + + Assert.assertEquals(node.getNumber(), id); + + } + + + @Test(dependsOnMethods = { "constructNode" }) + public void toStringTest() { + + Assert.assertEquals(node.toString(), "Node-"+id); + + } + + + @Test(dependsOnMethods = { "constructNode" }) + public void addArc() { + + Node n2 = new Node(2); + float w2 = (float) 0.123; + String l2 = "somthing cool"; + + Node n3 = new Node(3); + float w3 = (float) 124.78; + String l3 = "hurray!"; + + Node n4 = new Node(4); + float w4 = (float) Double.POSITIVE_INFINITY; + String l4 = "\u0000"; + + Assert.assertEquals(node.size(), 0); + + node.addArc(n2,(float) w2, l2); + Assert.assertEquals(node.size(), 1); + Arc a2 = node.getOutgoingArcs().get(0); + Assert.assertEquals(a2.getHead(), n2); + Assert.assertEquals(a2.getTail(), node); + Assert.assertEquals(a2.getCost(), w2); + Assert.assertEquals(a2.getLabel(), l2); + + node.addArc(n3,(float) w3, l3); + Assert.assertEquals(node.size(), 2); + Arc a3 = node.getOutgoingArcs().get(1); + Assert.assertEquals(a3.getHead(), n3); + Assert.assertEquals(a3.getTail(), node); + Assert.assertEquals(a3.getCost(), w3); + Assert.assertEquals(a3.getLabel(), l3); + + node.addArc(n4, (float) w4, l4); + Assert.assertEquals(node.size(), 3); + Arc a4 = node.getOutgoingArcs().get(2); + Assert.assertEquals(a4.getHead(), n4); + Assert.assertEquals(a4.getTail(), node); + Assert.assertEquals(a4.getCost(), w4); + Assert.assertEquals(a4.getLabel(), l4); + + } +} diff --git a/src/test/java/org/apache/joshua/packed/Benchmark.java b/src/test/java/org/apache/joshua/packed/Benchmark.java new file mode 100644 index 00000000..41cf2a0c --- /dev/null +++ b/src/test/java/org/apache/joshua/packed/Benchmark.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.packed; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.IntBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.util.Random; + +/** + * This program runs a little benchmark to check reading speed on various data + * representations. + * + * Usage: java Benchmark PACKED_GRAMMAR_DIR TIMES + */ + +public class Benchmark { + + + private static final Logger LOG = LoggerFactory.getLogger(Benchmark.class); + + private IntBuffer intBuffer; + private MappedByteBuffer byteBuffer; + private int[] intArray; + + public Benchmark(String dir) throws IOException { + File file = new File(dir + "/slice_00000.source"); + + FileChannel source_channel = new FileInputStream(file).getChannel(); + int byte_size = (int) source_channel.size(); + int int_size = byte_size / 4; + + byteBuffer = source_channel.map(MapMode.READ_ONLY, 0, byte_size); + intBuffer = byteBuffer.asIntBuffer(); + + intArray = new int[int_size]; + intBuffer.get(intArray); + } + + public void benchmark(int times) { + LOG.info("Beginning benchmark."); + + Random r = new Random(); + r.setSeed(1234567890); + int[] positions = new int[1000]; + for (int i = 0; i < positions.length; i++) + positions[i] = r.nextInt(intArray.length); + + long sum; + + long start_time = System.currentTimeMillis(); + + sum = 0; + for (int t = 0; t < times; t++) + for (int i = 0; i < positions.length; i++) + sum += byteBuffer.getInt(positions[i] * 4); + LOG.info("Sum: {}", sum); + long byte_time = System.currentTimeMillis(); + + sum = 0; + for (int t = 0; t < times; t++) + for (int i = 0; i < positions.length; i++) + sum += intBuffer.get(positions[i]); + LOG.info("Sum: {}", sum); + long int_time = System.currentTimeMillis(); + + sum = 0; + for (int t = 0; t < times; t++) + for (int i = 0; i < positions.length; i++) + sum += intArray[positions[i]]; + LOG.info("Sum: {}", sum); + long array_time = System.currentTimeMillis(); + + sum = 0; + for (int t = 0; t < times; t++) + for (int i = 0; i < (intArray.length / 8); i++) + sum += intArray[i * 6] + intArray[i * 6 + 2]; + LOG.info("Sum: {}", sum); + long mult_time = System.currentTimeMillis(); + + sum = 0; + for (int t = 0; t < times; t++) { + int index = 0; + for (int i = 0; i < (intArray.length / 8); i++) { + sum += intArray[index] + intArray[index + 2]; + index += 6; + } + } + LOG.info("Sum: {}", sum); + long add_time = System.currentTimeMillis(); + + LOG.info("ByteBuffer: {}", (byte_time - start_time)); + LOG.info("IntBuffer: {}", (int_time - byte_time)); + LOG.info("Array: {}", (array_time - int_time)); + LOG.info("Multiply: {}", (mult_time - array_time)); + LOG.info("Add: {}", (add_time - mult_time)); + } + + public static void main(String args[]) throws IOException { + Benchmark pr = new Benchmark(args[0]); + pr.benchmark( Integer.parseInt(args[1])); + } +} diff --git a/src/test/java/org/apache/joshua/packed/CountRules.java 
b/src/test/java/org/apache/joshua/packed/CountRules.java new file mode 100644 index 00000000..5ada5ab8 --- /dev/null +++ b/src/test/java/org/apache/joshua/packed/CountRules.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.packed; + +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; + +import org.apache.joshua.corpus.Vocabulary; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +/** + * This program reads a packed representation and prints out some + * basic information about it. + * + * Usage: java CountRules PACKED_GRAMMAR_DIR + */ + +public class CountRules { + + public static void main(String args[]) { + + String dir = args[0]; + + File file = new File(dir + "/chunk_00000.source"); + FileInputStream stream = null; + FileChannel channel = null; + try { + // read the vocabulary + Vocabulary.read(new File(dir + "/vocabulary")); + + // get the channel etc + stream = new FileInputStream(file); + channel = stream.getChannel(); + int size = (int) channel.size(); + + MappedByteBuffer buffer = channel.map(MapMode.READ_ONLY, 0, size); + // byte[] bytes = new bytes[size]; + // buffer.get(bytes); + + // read the number of rules + int numRules = buffer.getInt(); + System.out.println(String.format("There are %d source sides at the root", numRules)); + + // read the first symbol and its offset + for (int i = 0; i < numRules; i++) { + // String symbol = Vocabulary.word(buffer.getInt()); + int symbol = buffer.getInt(); + String string = Vocabulary.word(symbol); + int offset = buffer.getInt(); + System.out.println(String.format("-> %s/%d [%d]", string, symbol, offset)); + } + + } catch (IOException e) { + + e.printStackTrace(); + + } finally { + try { + if (stream != null) + stream.close(); + + if (channel != null) + channel.close(); + + } catch (IOException e) { + + e.printStackTrace(); + + } + } + + + // // Read in the bytes + // int offset = 0; + // int numRead = 0; + // while (offset < bytes.length + // && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { + // offset += numRead; + // } + + // // Ensure all the bytes have been read in + // if (offset < bytes.length) { + // throw new IOException("Could not completely read file "+file.getName()); + // } + + // // Close the input stream and return bytes + // is.close(); + // return bytes; + } +} diff --git a/src/test/java/org/apache/joshua/packed/PrintRules.java b/src/test/java/org/apache/joshua/packed/PrintRules.java new file mode 100644 index 00000000..af6507f4 --- /dev/null +++ b/src/test/java/org/apache/joshua/packed/PrintRules.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.packed; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.IntBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; + +import org.apache.joshua.util.quantization.Quantizer; +import org.apache.joshua.util.quantization.QuantizerConfiguration; +import org.apache.joshua.corpus.Vocabulary; + +/** + * This program reads a packed representation and prints out some basic + * information about it. + * + * Usage: java PrintRules PACKED_GRAMMAR_DIR + */ + +public class PrintRules { + + private QuantizerConfiguration quantization; + + private int[] source; + private int[] target; + private MappedByteBuffer features; + private MappedByteBuffer alignments; + + private int[] featureLookup; + private int[] alignmentLookup; + + private boolean have_alignments; + + public PrintRules(String dir) throws IOException { + File source_file = new File(dir + "/slice_00000.source"); + File target_file = new File(dir + "/slice_00000.target"); + File feature_file = new File(dir + "/slice_00000.features"); + File alignment_file = new File(dir + "/slice_00000.alignments"); + + have_alignments = alignment_file.exists(); + + // Read the vocabulary. + Vocabulary.read(new File(dir + "/vocabulary")); + + // Read the quantizer setup. + quantization = new QuantizerConfiguration(); + quantization.read(dir + "/quantization"); + + // Get the channels etc. + @SuppressWarnings("resource") + FileChannel source_channel = new FileInputStream(source_file).getChannel(); + int source_size = (int) source_channel.size(); + IntBuffer source_buffer = source_channel.map(MapMode.READ_ONLY, 0, + source_size).asIntBuffer(); + source = new int[source_size / 4]; + source_buffer.get(source); + + @SuppressWarnings("resource") + FileChannel target_channel = new FileInputStream(target_file).getChannel(); + int target_size = (int) target_channel.size(); + IntBuffer target_buffer = target_channel.map(MapMode.READ_ONLY, 0, + target_size).asIntBuffer(); + target = new int[target_size / 4]; + target_buffer.get(target); + + @SuppressWarnings("resource") + FileChannel feature_channel = new FileInputStream(feature_file).getChannel(); + int feature_size = (int) feature_channel.size(); + features = feature_channel.map(MapMode.READ_ONLY, 0, feature_size); + + if (have_alignments) { + @SuppressWarnings("resource") + FileChannel alignment_channel = new FileInputStream(alignment_file).getChannel(); + int alignment_size = (int) alignment_channel.size(); + alignments = alignment_channel.map(MapMode.READ_ONLY, 0, alignment_size); + } + + int num_feature_blocks = features.getInt(); + featureLookup = new int[num_feature_blocks]; + // Read away data size. 
+ features.getInt(); + for (int i = 0; i < num_feature_blocks; i++) + featureLookup[i] = features.getInt(); + + int num_alignment_blocks = alignments.getInt(); + alignmentLookup = new int[num_alignment_blocks]; + // Read away data size. + alignments.getInt(); + for (int i = 0; i < num_alignment_blocks; i++) + alignmentLookup[i] = alignments.getInt(); + + if (num_alignment_blocks != num_feature_blocks) + throw new RuntimeException("Number of blocks doesn't match up."); + } + + public void traverse() { + traverse(0, ""); + } + + private void traverse(int position, String src_side) { + int num_children = source[position]; + int[] addresses = new int[num_children]; + int[] symbols = new int[num_children]; + int j = position + 1; + for (int i = 0; i < num_children; i++) { + symbols[i] = source[j++]; + addresses[i] = source[j++]; + } + int num_rules = source[j++]; + for (int i = 0; i < num_rules; i++) { + int lhs = source[j++]; + int tgt_address = source[j++]; + int data_address = source[j++]; + printRule(src_side, lhs, tgt_address, data_address); + } + for (int i = 0; i < num_children; i++) { + traverse(addresses[i], src_side + " " + Vocabulary.word(symbols[i])); + } + } + + private String getTarget(int pointer) { + StringBuilder sb = new StringBuilder(); + do { + pointer = target[pointer]; + if (pointer != -1) { + int symbol = target[pointer + 1]; + if (symbol < 0) + sb.append(" ").append("NT" + symbol); + else + sb.append(" ").append(Vocabulary.word(symbol)); + } + } while (pointer != -1); + return sb.toString(); + } + + private String getFeatures(int block_id) { + StringBuilder sb = new StringBuilder(); + + int data_position = featureLookup[block_id]; + int num_features = features.getInt(data_position); + data_position += 4; + for (int i = 0; i < num_features; i++) { + int feature_id = features.getInt(data_position); + Quantizer quantizer = quantization.get(feature_id); + sb.append(" " + Vocabulary.word(feature_id) + "=" + + quantizer.read(features, data_position)); + data_position += 4 + quantizer.size(); + } + return sb.toString(); + } + + private String getAlignments(int block_id) { + StringBuilder sb = new StringBuilder(); + + int data_position = alignmentLookup[block_id]; + byte num_points = alignments.get(data_position); + for (int i = 0; i < num_points; i++) { + byte src = alignments.get(data_position + 1 + 2 * i); + byte tgt = alignments.get(data_position + 2 + 2 * i); + + sb.append(" " + src + "-" + tgt); + } + return sb.toString(); + } + + private void printRule(String src_side, int lhs, int tgt_address, + int data_address) { + System.out.println(Vocabulary.word(lhs) + " |||" + + src_side + " |||" + + getTarget(tgt_address) + " |||" + + getFeatures(data_address) + + (have_alignments ? " |||" + getAlignments(data_address) : "")); + } + + public static void main(String args[]) throws IOException { + PrintRules pr = new PrintRules(args[0]); + pr.traverse(); + } +} diff --git a/test/packed/README b/src/test/java/org/apache/joshua/packed/README similarity index 100% rename from test/packed/README rename to src/test/java/org/apache/joshua/packed/README diff --git a/src/test/java/org/apache/joshua/packed/VocabTest.java b/src/test/java/org/apache/joshua/packed/VocabTest.java new file mode 100644 index 00000000..523df4c9 --- /dev/null +++ b/src/test/java/org/apache/joshua/packed/VocabTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
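PrintRules.traverse() above doubles as documentation for the packed grammar's source-side trie encoding. Reconstructed from its reads (the field names here are descriptive, not taken from the codebase), each trie node at offset p in the source[] array lays out as:

// source[p]                      number of children, c
// source[p+1 .. p+2c]            c pairs of (childSymbol, childAddress), symbol first
// source[p+2c+1]                 number of rules stored at this node, r
// source[p+2c+2 .. p+2c+1+3r]    r triples of (lhs, targetAddress, dataAddress)
//
// traverse() recurses on each childAddress, extending the source side with the
// child's symbol; targetAddress and dataAddress index the target-side array and
// the feature/alignment blocks read by getTarget(), getFeatures(), and getAlignments().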
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.packed; + +import java.io.File; +import java.io.IOException; + +import org.apache.joshua.corpus.Vocabulary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class VocabTest { + + private static final Logger LOG = LoggerFactory.getLogger(VocabTest.class); + + //FIXME: no main() in automated test case, + public static void main(String args[]) { + + int numWords = 0; + try { + String dir = args[0]; + + boolean read = Vocabulary.read(new File(dir + "/vocabulary")); + if (! read) { + System.err.println("VocabTest: Failed to read the vocabulary."); + System.exit(1); + } + + int id = 0; + while (Vocabulary.hasId(id)) { + String word = Vocabulary.word(id); + System.out.println(String.format("VOCAB: %d\t%s", id, word)); + numWords++; + id++; + } + } catch (IOException e) { + LOG.error(e.getMessage(), e); + } + + System.out.println("read " + numWords + " words"); + } +} diff --git a/test/packed/packer.config b/src/test/java/org/apache/joshua/packed/packer.config similarity index 100% rename from test/packed/packer.config rename to src/test/java/org/apache/joshua/packed/packer.config diff --git a/test/packed/small_grammar b/src/test/java/org/apache/joshua/packed/small_grammar similarity index 100% rename from test/packed/small_grammar rename to src/test/java/org/apache/joshua/packed/small_grammar diff --git a/test/packed/test.sh b/src/test/java/org/apache/joshua/packed/test.sh similarity index 100% rename from test/packed/test.sh rename to src/test/java/org/apache/joshua/packed/test.sh diff --git a/tst/joshua/system/AlignmentMapTest.java b/src/test/java/org/apache/joshua/system/AlignmentMapTest.java similarity index 95% rename from tst/joshua/system/AlignmentMapTest.java rename to src/test/java/org/apache/joshua/system/AlignmentMapTest.java index 50c3aff4..eba732a6 100644 --- a/tst/joshua/system/AlignmentMapTest.java +++ b/src/test/java/org/apache/joshua/system/AlignmentMapTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ - package joshua.system; + package org.apache.joshua.system; import static org.junit.Assert.*; @@ -25,8 +25,8 @@ import java.util.List; import java.util.Map; -import joshua.corpus.Vocabulary; -import joshua.decoder.ff.tm.Rule; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.ff.tm.Rule; import org.junit.Before; import org.junit.Test; diff --git a/tst/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java similarity index 87% rename from tst/joshua/system/KenLmTest.java rename to src/test/java/org/apache/joshua/system/KenLmTest.java index dba74fcf..6c05a58c 100644 --- a/tst/joshua/system/KenLmTest.java +++ b/src/test/java/org/apache/joshua/system/KenLmTest.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package joshua.system; + package org.apache.joshua.system; -import static joshua.corpus.Vocabulary.registerLanguageModel; -import static joshua.corpus.Vocabulary.unregisterLanguageModels; +import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel; +import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels; import static org.junit.Assert.*; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.ff.lm.KenLM; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.ff.lm.KenLM; import org.junit.After; import org.junit.Before; diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java similarity index 76% rename from tst/joshua/system/MultithreadedTranslationTests.java rename to src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java index b257aa65..194be6f3 100644 --- a/tst/joshua/system/MultithreadedTranslationTests.java +++ b/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java @@ -16,21 +16,23 @@ * specific language governing permissions and limitations * under the License. */ - package joshua.system; + package org.apache.joshua.system; import static org.junit.Assert.assertTrue; +import java.io.BufferedReader; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.Translations; -import joshua.decoder.io.TranslationRequest; - +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.MetaDataException; +import org.apache.joshua.decoder.io.TranslationRequestStream; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -108,7 +110,7 @@ public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNRe // GIVEN int inputLines = 10000; - joshuaConfig.construct_structured_output = true; // Enabled alignments. + //joshuaConfig.construct_structured_output = true; // Enabled alignments. 
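+ // Migration note (as used in this test, inferred from the calls below):
+ // the TranslationRequest/Translations pair is replaced by a
+ // TranslationRequestStream; decodeAll(req, output) drains the request and
+ // writes translations to the supplied OutputStream, and the test then
+ // iterates req.next() to collect the parsed Sentence objects.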
StringBuilder sb = new StringBuilder(); for (int i = 0; i < inputLines; i++) { sb.append(INPUT + "\n"); @@ -116,19 +118,39 @@ public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNRe // Append a large string together to simulate N requests to the decoding // engine.
- TranslationRequest req = new TranslationRequest(new ByteArrayInputStream(sb.toString()
- .getBytes(Charset.forName("UTF-8"))), joshuaConfig);
+ TranslationRequestStream req = new TranslationRequestStream(
+ new BufferedReader(new InputStreamReader(new ByteArrayInputStream(sb.toString()
+ .getBytes(Charset.forName("UTF-8"))))), joshuaConfig);
+
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+
// WHEN
// Translate all spans in parallel.
- Translations translations = this.decoder.decodeAll(req);
- ArrayList translationResults = new ArrayList();
+ try {
+ this.decoder.decodeAll(req, output);
+ } catch (IOException e) {
+ throw new RuntimeException("decodeAll failed", e);
+ }
+ ArrayList<Sentence> translationResults = new ArrayList<Sentence>();
final long translationStartTime = System.nanoTime();
- Translation t;
- while ((t = translations.next()) != null) {
- translationResults.add(t);
+ Sentence t;
+ try {
+ while ((t = req.next()) != null) {
+ translationResults.add(t);
+ }
+ } catch (MetaDataException e) {
+ throw new RuntimeException(e);
+ } finally {
+ if (output != null) {
+ try {
+ output.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
final long translationEndTime = System.nanoTime(); diff --git a/tst/joshua/system/StructuredOutputTest.java b/src/test/java/org/apache/joshua/system/StructuredOutputTest.java similarity index 83% rename from tst/joshua/system/StructuredOutputTest.java rename to src/test/java/org/apache/joshua/system/StructuredOutputTest.java index 12e6e88a..99d89f95 100644 --- a/tst/joshua/system/StructuredOutputTest.java +++ b/src/test/java/org/apache/joshua/system/StructuredOutputTest.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License.
*/ - package joshua.system; +package org.apache.joshua.system; import java.util.Arrays; import java.util.List; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.Translation; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; @@ -107,14 +107,15 @@ public void test() { joshuaConfig.use_structured_output = true; // set structured output creation to true translation = decode(input); Assert - .assertEquals(expectedTranslation, translation.getTranslationString()); + .assertEquals(expectedTranslation, translation.getStructuredTranslation().getTranslationString()); Assert.assertEquals(Arrays.asList(expectedTranslation.split("\\s+")), - translation.getTranslationTokens()); - Assert.assertEquals(expectedScore, translation.getTranslationScore(), + translation.getStructuredTranslation().getTranslationTokens()); + Assert.assertEquals(expectedScore, translation.getStructuredTranslation().getTranslationScore(), 0.00001); - Assert.assertEquals(expectedWordAlignment, translation.getWordAlignment()); - Assert.assertEquals(translation.getWordAlignment().size(), translation - .getTranslationTokens().size()); + Assert.assertEquals(expectedWordAlignment, translation.getStructuredTranslation() + .getTranslationWordAlignments().get(0)); + Assert.assertEquals(translation.getStructuredTranslation().getTranslationWordAlignments().size(), translation. + getStructuredTranslation().getTranslationTokens().size()); } diff --git a/tst/joshua/system/StructuredTranslationTest.java b/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java similarity index 90% rename from tst/joshua/system/StructuredTranslationTest.java rename to src/test/java/org/apache/joshua/system/StructuredTranslationTest.java index 74606142..1cab6905 100644 --- a/tst/joshua/system/StructuredTranslationTest.java +++ b/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package joshua.system; +package org.apache.joshua.system; import static java.util.Arrays.asList; -import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -27,13 +27,13 @@ import java.util.List; import java.util.Map; -import joshua.corpus.Vocabulary; -import joshua.decoder.Decoder; -import joshua.decoder.JoshuaConfiguration; -import joshua.decoder.StructuredTranslation; -import joshua.decoder.Translation; -import joshua.decoder.ff.FeatureVector; -import joshua.decoder.segment_file.Sentence; +import org.apache.joshua.corpus.Vocabulary; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.JoshuaConfiguration; +import org.apache.joshua.decoder.StructuredTranslation; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.ff.FeatureVector; +import org.apache.joshua.decoder.segment_file.Sentence; import org.junit.After; import org.junit.Before; @@ -115,7 +115,7 @@ private Translation decode(String input) { @Test public void givenInput_whenRegularOutputFormat_thenExpectedOutput() { // GIVEN - joshuaConfig.construct_structured_output = false; + //joshuaConfig.construct_structured_output = false; joshuaConfig.outputFormat = "%s | %a "; // WHEN @@ -128,7 +128,7 @@ public void givenInput_whenRegularOutputFormat_thenExpectedOutput() { @Test public void givenInput_whenRegularOutputFormatWithTopN1_thenExpectedOutput() { // GIVEN - joshuaConfig.construct_structured_output = false; + //joshuaConfig.construct_structured_output = false; joshuaConfig.outputFormat = "%s | %e | %a | %c"; joshuaConfig.topN = 1; @@ -143,7 +143,7 @@ public void givenInput_whenRegularOutputFormatWithTopN1_thenExpectedOutput() { @Test public void givenInput_whenStructuredOutputFormat_thenExpectedOutput() { // GIVEN - joshuaConfig.construct_structured_output = true; + //joshuaConfig.construct_structured_output = true; // WHEN final StructuredTranslation translation = decode(INPUT).getStructuredTranslation(); @@ -165,7 +165,7 @@ public void givenInput_whenStructuredOutputFormat_thenExpectedOutput() { @Test public void givenEmptyInput_whenStructuredOutputFormat_thenEmptyOutput() { // GIVEN - joshuaConfig.construct_structured_output = true; + //joshuaConfig.construct_structured_output = true; // WHEN final StructuredTranslation translation = decode("").getStructuredTranslation(); @@ -184,7 +184,7 @@ public void givenEmptyInput_whenStructuredOutputFormat_thenEmptyOutput() { @Test public void givenOOVInput_whenStructuredOutputFormat_thenOOVOutput() { // GIVEN - joshuaConfig.construct_structured_output = true; + //joshuaConfig.construct_structured_output = true; final String input = "gabarbl"; // WHEN @@ -204,7 +204,7 @@ public void givenOOVInput_whenStructuredOutputFormat_thenOOVOutput() { @Test public void givenEmptyInput_whenRegularOutputFormat_thenNewlineOutput() { // GIVEN - joshuaConfig.construct_structured_output = false; + //joshuaConfig.construct_structured_output = false; // WHEN final Translation translation = decode(""); diff --git a/src/test/java/org/apache/joshua/ui/tree_visualizer/tree/TreeTest.java b/src/test/java/org/apache/joshua/ui/tree_visualizer/tree/TreeTest.java new file mode 100644 index 00000000..55e8f566 --- /dev/null +++ b/src/test/java/org/apache/joshua/ui/tree_visualizer/tree/TreeTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.ui.tree_visualizer.tree; + +import java.util.List; + +import org.testng.Assert; +import org.testng.annotations.Test; + +public class TreeTest { + @Test(expectedExceptions = { IllegalArgumentException.class }) + public void ctor_EmptyString_IllegalArgument() { + Tree tree = new Tree(""); + Assert.assertEquals(tree.size(), 0); + } + + @Test(expectedExceptions = { IllegalArgumentException.class }) + public void ctor_TooFewCloseParens_IllegalArgument() { + Tree tree = new Tree("(A{0-1} foo"); + Assert.assertEquals(tree.size(), 0); + } + + @Test + public void simpleTree_correctSize() { + Tree tree = new Tree("(A{0-1} foo)"); + Assert.assertEquals(tree.size(), 2); + } + + @Test + public void simpleTree_correctRoot() { + Tree tree = new Tree("(A{0-1} foo)"); + Tree.Node root = tree.root(); + Assert.assertEquals(root.label(), "A"); + Assert.assertEquals(root.sourceStartIndex(), 0); + Assert.assertEquals(root.sourceEndIndex(), 1); + Assert.assertEquals(root.children().size(), 1); + } + + @Test + public void simpleTree_correctLeaf() { + Tree tree = new Tree("(A{0-1} foo)"); + Tree.Node leaf = tree.root().children().get(0); + Assert.assertEquals(leaf.label(), "foo"); + Assert.assertEquals(leaf.sourceStartIndex(), -1); + Assert.assertEquals(leaf.sourceEndIndex(), -1); + Assert.assertEquals(leaf.children().size(), 0); + } + + @Test + public void simpleTree_toString() { + Tree tree = new Tree("(A{0-1} foo)"); + Assert.assertEquals(tree.toString(), "(A{0-1} foo)"); + } + + @Test + public void trickyTree_children() { + Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); + List children = tree.root().children(); + Assert.assertEquals(children.size(), 2); + Tree.Node foo = children.get(0); + Assert.assertEquals(foo.label(), "foo"); + Assert.assertTrue(foo.isLeaf()); + Assert.assertEquals(foo.sourceStartIndex(), -1); + Assert.assertEquals(foo.sourceEndIndex(), -1); + Tree.Node b = children.get(1); + Assert.assertEquals(b.label(), "B"); + Assert.assertEquals(b.children().size(), 1); + Assert.assertFalse(b.isLeaf()); + Assert.assertEquals(b.sourceStartIndex(), 1); + Assert.assertEquals(b.sourceEndIndex(), 2); + } + + @Test + public void SourceStartComparator() { + Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); + Tree.Node a = tree.root(); + Tree.Node b = a.children().get(1); + Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator(); + Assert.assertTrue(cmp.compare(a, b) < 0); + } + + @Test + public void SourceStartComparator_LeafSmallerThanAllInternals() { + Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); + Tree.Node a = tree.root(); + Tree.Node foo = a.children().get(0); + Tree.Node b = a.children().get(1); + Tree.Node bar = b.children().get(0); + Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator(); + 
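+ // Leaf nodes carry the sentinel span {-1,-1} (see simpleTree_correctLeaf
+ // above), so comparing on source start index orders every leaf before any
+ // internal node.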
Assert.assertTrue(cmp.compare(foo, a) < 0); + Assert.assertTrue(cmp.compare(foo, b) < 0); + Assert.assertTrue(cmp.compare(bar, a) < 0); + Assert.assertTrue(cmp.compare(bar, b) < 0); + } +} diff --git a/test/joshua/util/BitsTest.java b/src/test/java/org/apache/joshua/util/BitsTest.java similarity index 84% rename from test/joshua/util/BitsTest.java rename to src/test/java/org/apache/joshua/util/BitsTest.java index def13f80..50704dc6 100644 --- a/test/joshua/util/BitsTest.java +++ b/src/test/java/org/apache/joshua/util/BitsTest.java @@ -1,21 +1,22 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. + * http://www.apache.org/licenses/LICENSE-2.0 * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - * MA 02111-1307 USA + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ -package joshua.util; +package org.apache.joshua.util; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/src/test/java/org/apache/joshua/util/CacheTest.java b/src/test/java/org/apache/joshua/util/CacheTest.java new file mode 100644 index 00000000..53b8eb25 --- /dev/null +++ b/src/test/java/org/apache/joshua/util/CacheTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.joshua.util; + +import org.testng.Assert; +import org.testng.annotations.Test; + +public class CacheTest { + + @Test + public void test() { + + Cache cache = new Cache(5); + + cache.put("a", 1); + cache.put("b", 2); + cache.put("c", 3); + cache.put("d", 4); + cache.put("e", 5); + + Assert.assertTrue(cache.containsKey("a")); + Assert.assertTrue(cache.containsKey("b")); + Assert.assertTrue(cache.containsKey("c")); + Assert.assertTrue(cache.containsKey("d")); + Assert.assertTrue(cache.containsKey("e")); + + // Access the "a" element in the cache + cache.get("a"); + + // Now add a new element that exceeds the capacity of the cache + cache.put("f", 6); + + Assert.assertTrue(cache.containsKey("a")); + + } + +} diff --git a/test/joshua/util/CountsTest.java b/src/test/java/org/apache/joshua/util/CountsTest.java similarity index 68% rename from test/joshua/util/CountsTest.java rename to src/test/java/org/apache/joshua/util/CountsTest.java index 9eb43350..e6a20a49 100644 --- a/test/joshua/util/CountsTest.java +++ b/src/test/java/org/apache/joshua/util/CountsTest.java @@ -1,21 +1,22 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. + * http://www.apache.org/licenses/LICENSE-2.0 * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - * MA 02111-1307 USA + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ -package joshua.util; +package org.apache.joshua.util; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/tst/joshua/util/FormatUtilsTest.java b/src/test/java/org/apache/joshua/util/FormatUtilsTest.java similarity index 85% rename from tst/joshua/util/FormatUtilsTest.java rename to src/test/java/org/apache/joshua/util/FormatUtilsTest.java index 254522d4..853cf696 100644 --- a/tst/joshua/util/FormatUtilsTest.java +++ b/src/test/java/org/apache/joshua/util/FormatUtilsTest.java @@ -16,14 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package joshua.util; + package org.apache.joshua.util; -import static joshua.util.FormatUtils.cleanNonTerminal; -import static joshua.util.FormatUtils.escapeSpecialSymbols; -import static joshua.util.FormatUtils.isNonterminal; -import static joshua.util.FormatUtils.markup; -import static joshua.util.FormatUtils.stripNonTerminalIndex; -import static joshua.util.FormatUtils.unescapeSpecialSymbols; +import static org.apache.joshua.util.FormatUtils.cleanNonTerminal; +import static org.apache.joshua.util.FormatUtils.escapeSpecialSymbols; +import static org.apache.joshua.util.FormatUtils.isNonterminal; +import static org.apache.joshua.util.FormatUtils.markup; +import static org.apache.joshua.util.FormatUtils.stripNonTerminalIndex; +import static org.apache.joshua.util.FormatUtils.unescapeSpecialSymbols; import static org.junit.Assert.*; import org.junit.Test; diff --git a/src/test/java/org/apache/joshua/util/io/BinaryTest.java b/src/test/java/org/apache/joshua/util/io/BinaryTest.java new file mode 100644 index 00000000..6739b8b7 --- /dev/null +++ b/src/test/java/org/apache/joshua/util/io/BinaryTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.joshua.util.io; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInput; +import java.io.ObjectOutput; +import java.util.HashSet; +import java.util.Set; + +import org.apache.joshua.corpus.Vocabulary; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class BinaryTest { + + @Test + public void externalizeVocabulary() throws IOException, ClassNotFoundException { + + Set words = new HashSet(); + + for (char c1='a'; c1<='z'; c1++) { + words.add(new String(new char[]{c1})); + for (char c2='a'; c2<='z'; c2++) { + words.add(new String(new char[]{c1,c2})); + } + } + + Vocabulary vocab = new Vocabulary(); + vocab.addAll(words.toArray(new String[words.size()])); + + try { + + File tempFile = File.createTempFile(BinaryTest.class.getName(), "vocab"); + FileOutputStream outputStream = new FileOutputStream(tempFile); + @SuppressWarnings({ "unused", "resource" }) + ObjectOutput out = new BinaryOut(outputStream, true); + vocab.write(tempFile.toString()); + + @SuppressWarnings("resource") + ObjectInput in = new BinaryIn(tempFile.getAbsolutePath(), Vocabulary.class); + Object o = in.readObject(); + Assert.assertTrue(o instanceof Vocabulary); + + Vocabulary newVocab = (Vocabulary) o; + + Assert.assertNotNull(newVocab); + Assert.assertEquals(newVocab.size(), vocab.size()); + + Assert.assertTrue(newVocab.equals(vocab)); + + } catch (SecurityException e) { + Assert.fail("Operating system is unable to create a temp file required by this unit test: " + e); + } + } +} diff --git a/src/test/java/org/apache/joshua/zmert/BLEUTest.java b/src/test/java/org/apache/joshua/zmert/BLEUTest.java new file mode 100644 index 00000000..9423d889 --- /dev/null +++ b/src/test/java/org/apache/joshua/zmert/BLEUTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.joshua.zmert; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Scanner; + +import org.apache.joshua.metrics.BLEU; +import org.apache.joshua.metrics.EvaluationMetric; +import org.testng.Assert; +import org.testng.annotations.Parameters; +import org.testng.annotations.Test; + +/** + * Unit tests for BLEU class. 
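+ *
+ * For simpleTest below, the expected score can be recomputed by hand from the
+ * sufficient statistics: n-gram precisions 14/27, 8/26, 5/25 and 3/24, with a
+ * candidate length of 27 against a reference length of 23 (brevity penalty
+ * 1.0), give BLEU = exp((ln(14/27) + ln(8/26) + ln(5/25) + ln(3/24)) / 4),
+ * which is approximately 0.2513.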
+ * + * @author Lane Schwartz + * @version $LastChangedDate$ + */ +public class BLEUTest { + + @Test + public void metricName() { + + // Setup the EvaluationMetric class + EvaluationMetric.set_numSentences(0); + EvaluationMetric.set_refsPerSen(1); + EvaluationMetric.set_refSentences(null); + + BLEU bleu = new BLEU(); + + Assert.assertEquals(bleu.get_metricName(), "BLEU"); + + } + + @Test + public void defaultConstructor() { + + // Setup the EvaluationMetric class + EvaluationMetric.set_numSentences(0); + EvaluationMetric.set_refsPerSen(1); + EvaluationMetric.set_refSentences(null); + + BLEU bleu = new BLEU(); + + // Default constructor should use a maximum n-gram length of 4 + Assert.assertEquals(bleu.getMaxGramLength(), 4); + + // Default constructor should use the closest reference + Assert.assertEquals(bleu.getEffLengthMethod(), BLEU.EffectiveLengthMethod.CLOSEST); + + } + + @Test + public void simpleTest() { + + String ref = "this is the fourth chromosome whose sequence has been completed to date . it comprises more than 87 million pairs of dna ."; + String test = "this is the fourth chromosome to be fully sequenced up till now and it comprises of over 87 million pairs of deoxyribonucleic acid ( dna ) ."; + + // refSentences[i][r] stores the r'th reference of the i'th sentence + String[][] refSentences = new String[1][1]; + refSentences[0][0] = ref; + + EvaluationMetric.set_numSentences(1); + EvaluationMetric.set_refsPerSen(1); + EvaluationMetric.set_refSentences(refSentences); + + BLEU bleu = new BLEU(); + + // testSentences[i] stores the candidate translation for the i'th sentence + String[] testSentences = new String[1]; + testSentences[0] = test; + try { + // Check BLEU score matches + double actualScore = bleu.score(testSentences); + double expectedScore = 0.2513; + double acceptableScoreDelta = 0.00001f; + + Assert.assertEquals(actualScore, expectedScore, acceptableScoreDelta); + + // Check sufficient statistics match + int[] actualSS = bleu.suffStats(testSentences); + int[] expectedSS = {14,27,8,26,5,25,3,24,27,23}; + + Assert.assertEquals(actualSS[0], expectedSS[0], 0); // 1-gram matches + Assert.assertEquals(actualSS[1], expectedSS[1], 0); // 1-gram total + Assert.assertEquals(actualSS[2], expectedSS[2], 0); // 2-gram matches + Assert.assertEquals(actualSS[3], expectedSS[3], 0); // 2-gram total + Assert.assertEquals(actualSS[4], expectedSS[4], 0); // 3-gram matches + Assert.assertEquals(actualSS[5], expectedSS[5], 0); // 3-gram total + Assert.assertEquals(actualSS[6], expectedSS[6], 0); // 4-gram matches + Assert.assertEquals(actualSS[7], expectedSS[7], 0); // 4-gram total + Assert.assertEquals(actualSS[8], expectedSS[8], 0); // candidate length + Assert.assertEquals(actualSS[9], expectedSS[9], 0); // reference length + } catch (Exception e) { + Assert.fail(); + } + } + + @Parameters({"referenceFile","testFile"}) + @Test(enabled=false) + public void fileTest(String referenceFile, String testFile) throws FileNotFoundException { + + //TODO You can now read in the files, and do something useful with them. 
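+ // Disabled: referenceFile and testFile are TestNG suite parameters, so
+ // this test only runs when a testng.xml supplies them.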
+ + @SuppressWarnings("resource") + Scanner refScanner = new Scanner(new File(referenceFile)); + + while (refScanner.hasNextLine()) { + + @SuppressWarnings("unused") + String refLine = refScanner.nextLine(); + } + } +} diff --git a/test/bn-en/hiero/.gitignore b/src/test/resources/bn-en/hiero/.gitignore similarity index 100% rename from test/bn-en/hiero/.gitignore rename to src/test/resources/bn-en/hiero/.gitignore diff --git a/test/bn-en/hiero/class.map b/src/test/resources/bn-en/hiero/class.map similarity index 100% rename from test/bn-en/hiero/class.map rename to src/test/resources/bn-en/hiero/class.map diff --git a/test/bn-en/hiero/class_lm_2gram.gz b/src/test/resources/bn-en/hiero/class_lm_2gram.gz similarity index 100% rename from test/bn-en/hiero/class_lm_2gram.gz rename to src/test/resources/bn-en/hiero/class_lm_2gram.gz diff --git a/test/bn-en/hiero/class_lm_9gram.gz b/src/test/resources/bn-en/hiero/class_lm_9gram.gz similarity index 100% rename from test/bn-en/hiero/class_lm_9gram.gz rename to src/test/resources/bn-en/hiero/class_lm_9gram.gz diff --git a/test/bn-en/hiero/glue-grammar b/src/test/resources/bn-en/hiero/glue-grammar similarity index 100% rename from test/bn-en/hiero/glue-grammar rename to src/test/resources/bn-en/hiero/glue-grammar diff --git a/test/bn-en/hiero/grammar.gz b/src/test/resources/bn-en/hiero/grammar.gz similarity index 100% rename from test/bn-en/hiero/grammar.gz rename to src/test/resources/bn-en/hiero/grammar.gz diff --git a/test/bn-en/hiero/input.bn b/src/test/resources/bn-en/hiero/input.bn similarity index 100% rename from test/bn-en/hiero/input.bn rename to src/test/resources/bn-en/hiero/input.bn diff --git a/test/bn-en/hiero/joshua-berkeleylm.config b/src/test/resources/bn-en/hiero/joshua-berkeleylm.config similarity index 100% rename from test/bn-en/hiero/joshua-berkeleylm.config rename to src/test/resources/bn-en/hiero/joshua-berkeleylm.config diff --git a/test/bn-en/hiero/joshua-classlm.config b/src/test/resources/bn-en/hiero/joshua-classlm.config similarity index 100% rename from test/bn-en/hiero/joshua-classlm.config rename to src/test/resources/bn-en/hiero/joshua-classlm.config diff --git a/test/bn-en/hiero/joshua.config b/src/test/resources/bn-en/hiero/joshua.config similarity index 100% rename from test/bn-en/hiero/joshua.config rename to src/test/resources/bn-en/hiero/joshua.config diff --git a/test/bn-en/hiero/lm.gz b/src/test/resources/bn-en/hiero/lm.gz similarity index 100% rename from test/bn-en/hiero/lm.gz rename to src/test/resources/bn-en/hiero/lm.gz diff --git a/test/bn-en/hiero/output-classlm.gold b/src/test/resources/bn-en/hiero/output-classlm.gold similarity index 100% rename from test/bn-en/hiero/output-classlm.gold rename to src/test/resources/bn-en/hiero/output-classlm.gold diff --git a/test/bn-en/hiero/output.gold b/src/test/resources/bn-en/hiero/output.gold similarity index 100% rename from test/bn-en/hiero/output.gold rename to src/test/resources/bn-en/hiero/output.gold diff --git a/test/bn-en/hiero/output.gold.bleu b/src/test/resources/bn-en/hiero/output.gold.bleu similarity index 100% rename from test/bn-en/hiero/output.gold.bleu rename to src/test/resources/bn-en/hiero/output.gold.bleu diff --git a/test/bn-en/hiero/output.scores.berkeleylm.gold b/src/test/resources/bn-en/hiero/output.scores.berkeleylm.gold similarity index 100% rename from test/bn-en/hiero/output.scores.berkeleylm.gold rename to src/test/resources/bn-en/hiero/output.scores.berkeleylm.gold diff --git a/test/bn-en/hiero/output.scores.gold 
b/src/test/resources/bn-en/hiero/output.scores.gold similarity index 100% rename from test/bn-en/hiero/output.scores.gold rename to src/test/resources/bn-en/hiero/output.scores.gold diff --git a/test/bn-en/hiero/reference.en.0 b/src/test/resources/bn-en/hiero/reference.en.0 similarity index 100% rename from test/bn-en/hiero/reference.en.0 rename to src/test/resources/bn-en/hiero/reference.en.0 diff --git a/test/bn-en/hiero/reference.en.1 b/src/test/resources/bn-en/hiero/reference.en.1 similarity index 100% rename from test/bn-en/hiero/reference.en.1 rename to src/test/resources/bn-en/hiero/reference.en.1 diff --git a/test/bn-en/hiero/reference.en.2 b/src/test/resources/bn-en/hiero/reference.en.2 similarity index 100% rename from test/bn-en/hiero/reference.en.2 rename to src/test/resources/bn-en/hiero/reference.en.2 diff --git a/test/bn-en/hiero/reference.en.3 b/src/test/resources/bn-en/hiero/reference.en.3 similarity index 100% rename from test/bn-en/hiero/reference.en.3 rename to src/test/resources/bn-en/hiero/reference.en.3 diff --git a/test/bn-en/hiero/test-berkeleylm.sh b/src/test/resources/bn-en/hiero/test-berkeleylm.sh similarity index 100% rename from test/bn-en/hiero/test-berkeleylm.sh rename to src/test/resources/bn-en/hiero/test-berkeleylm.sh diff --git a/test/bn-en/hiero/test-classlm.sh b/src/test/resources/bn-en/hiero/test-classlm.sh similarity index 100% rename from test/bn-en/hiero/test-classlm.sh rename to src/test/resources/bn-en/hiero/test-classlm.sh diff --git a/test/bn-en/hiero/test-filter.sh b/src/test/resources/bn-en/hiero/test-filter.sh similarity index 100% rename from test/bn-en/hiero/test-filter.sh rename to src/test/resources/bn-en/hiero/test-filter.sh diff --git a/test/bn-en/hiero/test.sh b/src/test/resources/bn-en/hiero/test.sh similarity index 100% rename from test/bn-en/hiero/test.sh rename to src/test/resources/bn-en/hiero/test.sh diff --git a/test/bn-en/hiero/topN.pl b/src/test/resources/bn-en/hiero/topN.pl similarity index 100% rename from test/bn-en/hiero/topN.pl rename to src/test/resources/bn-en/hiero/topN.pl diff --git a/test/bn-en/packed/.gitignore b/src/test/resources/bn-en/packed/.gitignore similarity index 100% rename from test/bn-en/packed/.gitignore rename to src/test/resources/bn-en/packed/.gitignore diff --git a/test/bn-en/packed/grammar.glue b/src/test/resources/bn-en/packed/grammar.glue similarity index 100% rename from test/bn-en/packed/grammar.glue rename to src/test/resources/bn-en/packed/grammar.glue diff --git a/test/bn-en/packed/grammar.gz b/src/test/resources/bn-en/packed/grammar.gz similarity index 100% rename from test/bn-en/packed/grammar.gz rename to src/test/resources/bn-en/packed/grammar.gz diff --git a/test/bn-en/packed/grammar.packed/encoding b/src/test/resources/bn-en/packed/grammar.packed/encoding similarity index 100% rename from test/bn-en/packed/grammar.packed/encoding rename to src/test/resources/bn-en/packed/grammar.packed/encoding diff --git a/test/bn-en/packed/grammar.packed/slice_00000.features b/src/test/resources/bn-en/packed/grammar.packed/slice_00000.features similarity index 100% rename from test/bn-en/packed/grammar.packed/slice_00000.features rename to src/test/resources/bn-en/packed/grammar.packed/slice_00000.features diff --git a/test/bn-en/packed/grammar.packed/slice_00000.source b/src/test/resources/bn-en/packed/grammar.packed/slice_00000.source similarity index 100% rename from test/bn-en/packed/grammar.packed/slice_00000.source rename to src/test/resources/bn-en/packed/grammar.packed/slice_00000.source 
diff --git a/test/bn-en/packed/grammar.packed/slice_00000.target b/src/test/resources/bn-en/packed/grammar.packed/slice_00000.target similarity index 100% rename from test/bn-en/packed/grammar.packed/slice_00000.target rename to src/test/resources/bn-en/packed/grammar.packed/slice_00000.target diff --git a/test/bn-en/packed/grammar.packed/slice_00000.target.lookup b/src/test/resources/bn-en/packed/grammar.packed/slice_00000.target.lookup similarity index 100% rename from test/bn-en/packed/grammar.packed/slice_00000.target.lookup rename to src/test/resources/bn-en/packed/grammar.packed/slice_00000.target.lookup diff --git a/test/bn-en/packed/grammar.packed/vocabulary b/src/test/resources/bn-en/packed/grammar.packed/vocabulary similarity index 100% rename from test/bn-en/packed/grammar.packed/vocabulary rename to src/test/resources/bn-en/packed/grammar.packed/vocabulary diff --git a/test/bn-en/packed/input.bn b/src/test/resources/bn-en/packed/input.bn similarity index 100% rename from test/bn-en/packed/input.bn rename to src/test/resources/bn-en/packed/input.bn diff --git a/test/bn-en/packed/joshua.config b/src/test/resources/bn-en/packed/joshua.config similarity index 100% rename from test/bn-en/packed/joshua.config rename to src/test/resources/bn-en/packed/joshua.config diff --git a/test/bn-en/packed/lm.gz b/src/test/resources/bn-en/packed/lm.gz similarity index 100% rename from test/bn-en/packed/lm.gz rename to src/test/resources/bn-en/packed/lm.gz diff --git a/test/bn-en/packed/output.gold b/src/test/resources/bn-en/packed/output.gold similarity index 100% rename from test/bn-en/packed/output.gold rename to src/test/resources/bn-en/packed/output.gold diff --git a/test/bn-en/packed/output.scores.gold b/src/test/resources/bn-en/packed/output.scores.gold similarity index 100% rename from test/bn-en/packed/output.scores.gold rename to src/test/resources/bn-en/packed/output.scores.gold diff --git a/test/bn-en/packed/reference.en.0 b/src/test/resources/bn-en/packed/reference.en.0 similarity index 100% rename from test/bn-en/packed/reference.en.0 rename to src/test/resources/bn-en/packed/reference.en.0 diff --git a/test/bn-en/packed/reference.en.1 b/src/test/resources/bn-en/packed/reference.en.1 similarity index 100% rename from test/bn-en/packed/reference.en.1 rename to src/test/resources/bn-en/packed/reference.en.1 diff --git a/test/bn-en/packed/reference.en.2 b/src/test/resources/bn-en/packed/reference.en.2 similarity index 100% rename from test/bn-en/packed/reference.en.2 rename to src/test/resources/bn-en/packed/reference.en.2 diff --git a/test/bn-en/packed/reference.en.3 b/src/test/resources/bn-en/packed/reference.en.3 similarity index 100% rename from test/bn-en/packed/reference.en.3 rename to src/test/resources/bn-en/packed/reference.en.3 diff --git a/test/bn-en/packed/reference.en.all b/src/test/resources/bn-en/packed/reference.en.all similarity index 100% rename from test/bn-en/packed/reference.en.all rename to src/test/resources/bn-en/packed/reference.en.all diff --git a/test/bn-en/packed/test.sh b/src/test/resources/bn-en/packed/test.sh similarity index 100% rename from test/bn-en/packed/test.sh rename to src/test/resources/bn-en/packed/test.sh diff --git a/test/bn-en/samt/grammar.glue b/src/test/resources/bn-en/samt/grammar.glue similarity index 100% rename from test/bn-en/samt/grammar.glue rename to src/test/resources/bn-en/samt/grammar.glue diff --git a/test/bn-en/samt/grammar.gz b/src/test/resources/bn-en/samt/grammar.gz similarity index 100% rename from 
test/bn-en/samt/grammar.gz rename to src/test/resources/bn-en/samt/grammar.gz diff --git a/test/bn-en/samt/input.bn b/src/test/resources/bn-en/samt/input.bn similarity index 100% rename from test/bn-en/samt/input.bn rename to src/test/resources/bn-en/samt/input.bn diff --git a/test/bn-en/samt/joshua.config b/src/test/resources/bn-en/samt/joshua.config similarity index 100% rename from test/bn-en/samt/joshua.config rename to src/test/resources/bn-en/samt/joshua.config diff --git a/test/bn-en/samt/lm.gz b/src/test/resources/bn-en/samt/lm.gz similarity index 100% rename from test/bn-en/samt/lm.gz rename to src/test/resources/bn-en/samt/lm.gz diff --git a/test/bn-en/samt/output.gold b/src/test/resources/bn-en/samt/output.gold similarity index 100% rename from test/bn-en/samt/output.gold rename to src/test/resources/bn-en/samt/output.gold diff --git a/test/bn-en/samt/output.gold.bleu b/src/test/resources/bn-en/samt/output.gold.bleu similarity index 100% rename from test/bn-en/samt/output.gold.bleu rename to src/test/resources/bn-en/samt/output.gold.bleu diff --git a/test/bn-en/samt/output.scores.gold b/src/test/resources/bn-en/samt/output.scores.gold similarity index 100% rename from test/bn-en/samt/output.scores.gold rename to src/test/resources/bn-en/samt/output.scores.gold diff --git a/test/bn-en/samt/reference.en.0 b/src/test/resources/bn-en/samt/reference.en.0 similarity index 100% rename from test/bn-en/samt/reference.en.0 rename to src/test/resources/bn-en/samt/reference.en.0 diff --git a/test/bn-en/samt/reference.en.1 b/src/test/resources/bn-en/samt/reference.en.1 similarity index 100% rename from test/bn-en/samt/reference.en.1 rename to src/test/resources/bn-en/samt/reference.en.1 diff --git a/test/bn-en/samt/reference.en.2 b/src/test/resources/bn-en/samt/reference.en.2 similarity index 100% rename from test/bn-en/samt/reference.en.2 rename to src/test/resources/bn-en/samt/reference.en.2 diff --git a/test/bn-en/samt/reference.en.3 b/src/test/resources/bn-en/samt/reference.en.3 similarity index 100% rename from test/bn-en/samt/reference.en.3 rename to src/test/resources/bn-en/samt/reference.en.3 diff --git a/test/bn-en/samt/test.sh b/src/test/resources/bn-en/samt/test.sh similarity index 100% rename from test/bn-en/samt/test.sh rename to src/test/resources/bn-en/samt/test.sh diff --git a/src/test/resources/data/tiny.en b/src/test/resources/data/tiny.en new file mode 100644 index 00000000..bdb3dc3b --- /dev/null +++ b/src/test/resources/data/tiny.en @@ -0,0 +1,5 @@ +resumption of the session +i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . +you have requested a debate on this subject in the course of the next few days , during this part-session . +please rise , then , for this minute ' s silence . 
+( the house rose and observed a minute ' s silence ) diff --git a/test/decoder/constrained/.gitignore b/src/test/resources/decoder/constrained/.gitignore similarity index 100% rename from test/decoder/constrained/.gitignore rename to src/test/resources/decoder/constrained/.gitignore diff --git a/test/decoder/constrained/glue-grammar b/src/test/resources/decoder/constrained/glue-grammar similarity index 100% rename from test/decoder/constrained/glue-grammar rename to src/test/resources/decoder/constrained/glue-grammar diff --git a/test/decoder/constrained/gold.scores b/src/test/resources/decoder/constrained/gold.scores similarity index 100% rename from test/decoder/constrained/gold.scores rename to src/test/resources/decoder/constrained/gold.scores diff --git a/test/decoder/constrained/grammar.gz b/src/test/resources/decoder/constrained/grammar.gz similarity index 100% rename from test/decoder/constrained/grammar.gz rename to src/test/resources/decoder/constrained/grammar.gz diff --git a/test/decoder/constrained/input.bn b/src/test/resources/decoder/constrained/input.bn similarity index 100% rename from test/decoder/constrained/input.bn rename to src/test/resources/decoder/constrained/input.bn diff --git a/test/decoder/constrained/joshua.config b/src/test/resources/decoder/constrained/joshua.config similarity index 100% rename from test/decoder/constrained/joshua.config rename to src/test/resources/decoder/constrained/joshua.config diff --git a/test/decoder/constrained/lm.gz b/src/test/resources/decoder/constrained/lm.gz similarity index 100% rename from test/decoder/constrained/lm.gz rename to src/test/resources/decoder/constrained/lm.gz diff --git a/test/decoder/constrained/output.bleu b/src/test/resources/decoder/constrained/output.bleu similarity index 100% rename from test/decoder/constrained/output.bleu rename to src/test/resources/decoder/constrained/output.bleu diff --git a/test/decoder/constrained/output.gold b/src/test/resources/decoder/constrained/output.gold similarity index 100% rename from test/decoder/constrained/output.gold rename to src/test/resources/decoder/constrained/output.gold diff --git a/test/decoder/constrained/test.sh b/src/test/resources/decoder/constrained/test.sh similarity index 100% rename from test/decoder/constrained/test.sh rename to src/test/resources/decoder/constrained/test.sh diff --git a/test/decoder/constrained/weights b/src/test/resources/decoder/constrained/weights similarity index 100% rename from test/decoder/constrained/weights rename to src/test/resources/decoder/constrained/weights diff --git a/test/decoder/denormalization/input.txt b/src/test/resources/decoder/denormalization/input.txt similarity index 100% rename from test/decoder/denormalization/input.txt rename to src/test/resources/decoder/denormalization/input.txt diff --git a/test/decoder/denormalization/output.expected b/src/test/resources/decoder/denormalization/output.expected similarity index 100% rename from test/decoder/denormalization/output.expected rename to src/test/resources/decoder/denormalization/output.expected diff --git a/test/decoder/denormalization/test.sh b/src/test/resources/decoder/denormalization/test.sh similarity index 100% rename from test/decoder/denormalization/test.sh rename to src/test/resources/decoder/denormalization/test.sh diff --git a/test/decoder/dont-crash/input b/src/test/resources/decoder/dont-crash/input similarity index 100% rename from test/decoder/dont-crash/input rename to src/test/resources/decoder/dont-crash/input diff --git 
a/test/decoder/dont-crash/output.gold b/src/test/resources/decoder/dont-crash/output.gold similarity index 100% rename from test/decoder/dont-crash/output.gold rename to src/test/resources/decoder/dont-crash/output.gold diff --git a/test/decoder/dont-crash/test.sh b/src/test/resources/decoder/dont-crash/test.sh similarity index 100% rename from test/decoder/dont-crash/test.sh rename to src/test/resources/decoder/dont-crash/test.sh diff --git a/test/decoder/empty-test/.gitignore b/src/test/resources/decoder/empty-test/.gitignore similarity index 100% rename from test/decoder/empty-test/.gitignore rename to src/test/resources/decoder/empty-test/.gitignore diff --git a/test/decoder/empty-test/input b/src/test/resources/decoder/empty-test/input similarity index 100% rename from test/decoder/empty-test/input rename to src/test/resources/decoder/empty-test/input diff --git a/test/decoder/empty-test/output.gold b/src/test/resources/decoder/empty-test/output.gold similarity index 100% rename from test/decoder/empty-test/output.gold rename to src/test/resources/decoder/empty-test/output.gold diff --git a/test/decoder/empty-test/test.sh b/src/test/resources/decoder/empty-test/test.sh similarity index 100% rename from test/decoder/empty-test/test.sh rename to src/test/resources/decoder/empty-test/test.sh diff --git a/test/decoder/fragmentlm/fragments.txt b/src/test/resources/decoder/fragmentlm/fragments.txt similarity index 100% rename from test/decoder/fragmentlm/fragments.txt rename to src/test/resources/decoder/fragmentlm/fragments.txt diff --git a/test/decoder/fragmentlm/glue b/src/test/resources/decoder/fragmentlm/glue similarity index 100% rename from test/decoder/fragmentlm/glue rename to src/test/resources/decoder/fragmentlm/glue diff --git a/test/decoder/fragmentlm/grammar b/src/test/resources/decoder/fragmentlm/grammar similarity index 100% rename from test/decoder/fragmentlm/grammar rename to src/test/resources/decoder/fragmentlm/grammar diff --git a/test/decoder/fragmentlm/input b/src/test/resources/decoder/fragmentlm/input similarity index 100% rename from test/decoder/fragmentlm/input rename to src/test/resources/decoder/fragmentlm/input diff --git a/test/decoder/fragmentlm/joshua.config b/src/test/resources/decoder/fragmentlm/joshua.config similarity index 100% rename from test/decoder/fragmentlm/joshua.config rename to src/test/resources/decoder/fragmentlm/joshua.config diff --git a/test/decoder/fragmentlm/mapping.txt b/src/test/resources/decoder/fragmentlm/mapping.txt similarity index 100% rename from test/decoder/fragmentlm/mapping.txt rename to src/test/resources/decoder/fragmentlm/mapping.txt diff --git a/test/decoder/fragmentlm/test.sh b/src/test/resources/decoder/fragmentlm/test.sh similarity index 100% rename from test/decoder/fragmentlm/test.sh rename to src/test/resources/decoder/fragmentlm/test.sh diff --git a/test/decoder/k-best-extraction/glue-grammar b/src/test/resources/decoder/k-best-extraction/glue-grammar similarity index 100% rename from test/decoder/k-best-extraction/glue-grammar rename to src/test/resources/decoder/k-best-extraction/glue-grammar diff --git a/test/decoder/k-best-extraction/grammar b/src/test/resources/decoder/k-best-extraction/grammar similarity index 100% rename from test/decoder/k-best-extraction/grammar rename to src/test/resources/decoder/k-best-extraction/grammar diff --git a/test/decoder/k-best-extraction/input.txt b/src/test/resources/decoder/k-best-extraction/input.txt similarity index 100% rename from 
test/decoder/k-best-extraction/input.txt rename to src/test/resources/decoder/k-best-extraction/input.txt diff --git a/test/decoder/k-best-extraction/joshua.config b/src/test/resources/decoder/k-best-extraction/joshua.config similarity index 100% rename from test/decoder/k-best-extraction/joshua.config rename to src/test/resources/decoder/k-best-extraction/joshua.config diff --git a/test/decoder/k-best-extraction/lm.gz b/src/test/resources/decoder/k-best-extraction/lm.gz similarity index 100% rename from test/decoder/k-best-extraction/lm.gz rename to src/test/resources/decoder/k-best-extraction/lm.gz diff --git a/test/decoder/k-best-extraction/output.gold b/src/test/resources/decoder/k-best-extraction/output.gold similarity index 100% rename from test/decoder/k-best-extraction/output.gold rename to src/test/resources/decoder/k-best-extraction/output.gold diff --git a/test/decoder/k-best-extraction/output.scores.gold b/src/test/resources/decoder/k-best-extraction/output.scores.gold similarity index 100% rename from test/decoder/k-best-extraction/output.scores.gold rename to src/test/resources/decoder/k-best-extraction/output.scores.gold diff --git a/test/decoder/k-best-extraction/test.sh b/src/test/resources/decoder/k-best-extraction/test.sh similarity index 100% rename from test/decoder/k-best-extraction/test.sh rename to src/test/resources/decoder/k-best-extraction/test.sh diff --git a/test/decoder/left-state/glue-grammar b/src/test/resources/decoder/left-state/glue-grammar similarity index 100% rename from test/decoder/left-state/glue-grammar rename to src/test/resources/decoder/left-state/glue-grammar diff --git a/test/decoder/left-state/grammar.gz b/src/test/resources/decoder/left-state/grammar.gz similarity index 100% rename from test/decoder/left-state/grammar.gz rename to src/test/resources/decoder/left-state/grammar.gz diff --git a/test/decoder/left-state/input.bn b/src/test/resources/decoder/left-state/input.bn similarity index 100% rename from test/decoder/left-state/input.bn rename to src/test/resources/decoder/left-state/input.bn diff --git a/test/decoder/left-state/joshua.config b/src/test/resources/decoder/left-state/joshua.config similarity index 100% rename from test/decoder/left-state/joshua.config rename to src/test/resources/decoder/left-state/joshua.config diff --git a/test/decoder/left-state/lm.gz b/src/test/resources/decoder/left-state/lm.gz similarity index 100% rename from test/decoder/left-state/lm.gz rename to src/test/resources/decoder/left-state/lm.gz diff --git a/test/decoder/left-state/output.gold b/src/test/resources/decoder/left-state/output.gold similarity index 100% rename from test/decoder/left-state/output.gold rename to src/test/resources/decoder/left-state/output.gold diff --git a/test/decoder/left-state/output.scores.gold b/src/test/resources/decoder/left-state/output.scores.gold similarity index 100% rename from test/decoder/left-state/output.scores.gold rename to src/test/resources/decoder/left-state/output.scores.gold diff --git a/test/decoder/left-state/test.sh b/src/test/resources/decoder/left-state/test.sh similarity index 100% rename from test/decoder/left-state/test.sh rename to src/test/resources/decoder/left-state/test.sh diff --git a/test/decoder/lowercaser/config b/src/test/resources/decoder/lowercaser/config similarity index 100% rename from test/decoder/lowercaser/config rename to src/test/resources/decoder/lowercaser/config diff --git a/test/decoder/lowercaser/grammar.glue b/src/test/resources/decoder/lowercaser/grammar.glue similarity 
index 100% rename from test/decoder/lowercaser/grammar.glue rename to src/test/resources/decoder/lowercaser/grammar.glue diff --git a/test/decoder/lowercaser/grammar.test b/src/test/resources/decoder/lowercaser/grammar.test similarity index 100% rename from test/decoder/lowercaser/grammar.test rename to src/test/resources/decoder/lowercaser/grammar.test diff --git a/test/decoder/lowercaser/output.gold b/src/test/resources/decoder/lowercaser/output.gold similarity index 100% rename from test/decoder/lowercaser/output.gold rename to src/test/resources/decoder/lowercaser/output.gold diff --git a/test/decoder/lowercaser/test.sh b/src/test/resources/decoder/lowercaser/test.sh similarity index 100% rename from test/decoder/lowercaser/test.sh rename to src/test/resources/decoder/lowercaser/test.sh diff --git a/test/decoder/moses-compat/n-best.txt b/src/test/resources/decoder/moses-compat/n-best.txt similarity index 100% rename from test/decoder/moses-compat/n-best.txt rename to src/test/resources/decoder/moses-compat/n-best.txt diff --git a/test/decoder/moses-compat/output.expected b/src/test/resources/decoder/moses-compat/output.expected similarity index 100% rename from test/decoder/moses-compat/output.expected rename to src/test/resources/decoder/moses-compat/output.expected diff --git a/test/decoder/moses-compat/test.sh b/src/test/resources/decoder/moses-compat/test.sh similarity index 100% rename from test/decoder/moses-compat/test.sh rename to src/test/resources/decoder/moses-compat/test.sh diff --git a/test/decoder/n-ary/glue-grammar b/src/test/resources/decoder/n-ary/glue-grammar similarity index 100% rename from test/decoder/n-ary/glue-grammar rename to src/test/resources/decoder/n-ary/glue-grammar diff --git a/test/decoder/n-ary/gold.scores b/src/test/resources/decoder/n-ary/gold.scores similarity index 100% rename from test/decoder/n-ary/gold.scores rename to src/test/resources/decoder/n-ary/gold.scores diff --git a/test/decoder/n-ary/grammar b/src/test/resources/decoder/n-ary/grammar similarity index 100% rename from test/decoder/n-ary/grammar rename to src/test/resources/decoder/n-ary/grammar diff --git a/test/decoder/n-ary/input.txt b/src/test/resources/decoder/n-ary/input.txt similarity index 100% rename from test/decoder/n-ary/input.txt rename to src/test/resources/decoder/n-ary/input.txt diff --git a/test/decoder/n-ary/joshua.config b/src/test/resources/decoder/n-ary/joshua.config similarity index 100% rename from test/decoder/n-ary/joshua.config rename to src/test/resources/decoder/n-ary/joshua.config diff --git a/test/decoder/n-ary/lm.gz b/src/test/resources/decoder/n-ary/lm.gz similarity index 100% rename from test/decoder/n-ary/lm.gz rename to src/test/resources/decoder/n-ary/lm.gz diff --git a/test/decoder/n-ary/output.bleu b/src/test/resources/decoder/n-ary/output.bleu similarity index 100% rename from test/decoder/n-ary/output.bleu rename to src/test/resources/decoder/n-ary/output.bleu diff --git a/test/decoder/n-ary/output.gold b/src/test/resources/decoder/n-ary/output.gold similarity index 100% rename from test/decoder/n-ary/output.gold rename to src/test/resources/decoder/n-ary/output.gold diff --git a/test/decoder/n-ary/test.sh b/src/test/resources/decoder/n-ary/test.sh similarity index 100% rename from test/decoder/n-ary/test.sh rename to src/test/resources/decoder/n-ary/test.sh diff --git a/test/decoder/n-ary/weights b/src/test/resources/decoder/n-ary/weights similarity index 100% rename from test/decoder/n-ary/weights rename to 
src/test/resources/decoder/n-ary/weights diff --git a/test/decoder/num_translation_options/README b/src/test/resources/decoder/num_translation_options/README similarity index 100% rename from test/decoder/num_translation_options/README rename to src/test/resources/decoder/num_translation_options/README diff --git a/test/decoder/num_translation_options/glue-grammar b/src/test/resources/decoder/num_translation_options/glue-grammar similarity index 100% rename from test/decoder/num_translation_options/glue-grammar rename to src/test/resources/decoder/num_translation_options/glue-grammar diff --git a/test/decoder/num_translation_options/grammar.gz b/src/test/resources/decoder/num_translation_options/grammar.gz similarity index 100% rename from test/decoder/num_translation_options/grammar.gz rename to src/test/resources/decoder/num_translation_options/grammar.gz diff --git a/test/decoder/num_translation_options/grammar.packed/encoding b/src/test/resources/decoder/num_translation_options/grammar.packed/encoding similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/encoding rename to src/test/resources/decoder/num_translation_options/grammar.packed/encoding diff --git a/test/decoder/num_translation_options/grammar.packed/slice_00000.features b/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.features similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/slice_00000.features rename to src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.features diff --git a/test/decoder/num_translation_options/grammar.packed/slice_00000.source b/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.source similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/slice_00000.source rename to src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.source diff --git a/test/decoder/num_translation_options/grammar.packed/slice_00000.target b/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/slice_00000.target rename to src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target diff --git a/test/decoder/num_translation_options/grammar.packed/slice_00000.target.lookup b/src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target.lookup similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/slice_00000.target.lookup rename to src/test/resources/decoder/num_translation_options/grammar.packed/slice_00000.target.lookup diff --git a/test/decoder/num_translation_options/grammar.packed/vocabulary b/src/test/resources/decoder/num_translation_options/grammar.packed/vocabulary similarity index 100% rename from test/decoder/num_translation_options/grammar.packed/vocabulary rename to src/test/resources/decoder/num_translation_options/grammar.packed/vocabulary diff --git a/test/decoder/num_translation_options/input b/src/test/resources/decoder/num_translation_options/input similarity index 100% rename from test/decoder/num_translation_options/input rename to src/test/resources/decoder/num_translation_options/input diff --git a/test/decoder/num_translation_options/joshua.config b/src/test/resources/decoder/num_translation_options/joshua.config similarity index 100% rename from test/decoder/num_translation_options/joshua.config rename to 
src/test/resources/decoder/num_translation_options/joshua.config diff --git a/test/decoder/num_translation_options/joshua.config.packed b/src/test/resources/decoder/num_translation_options/joshua.config.packed similarity index 100% rename from test/decoder/num_translation_options/joshua.config.packed rename to src/test/resources/decoder/num_translation_options/joshua.config.packed diff --git a/test/decoder/num_translation_options/lm.gz b/src/test/resources/decoder/num_translation_options/lm.gz similarity index 100% rename from test/decoder/num_translation_options/lm.gz rename to src/test/resources/decoder/num_translation_options/lm.gz diff --git a/test/decoder/num_translation_options/output.gold b/src/test/resources/decoder/num_translation_options/output.gold similarity index 100% rename from test/decoder/num_translation_options/output.gold rename to src/test/resources/decoder/num_translation_options/output.gold diff --git a/test/decoder/num_translation_options/test.sh b/src/test/resources/decoder/num_translation_options/test.sh similarity index 100% rename from test/decoder/num_translation_options/test.sh rename to src/test/resources/decoder/num_translation_options/test.sh diff --git a/test/decoder/oov-list/config b/src/test/resources/decoder/oov-list/config similarity index 100% rename from test/decoder/oov-list/config rename to src/test/resources/decoder/oov-list/config diff --git a/test/decoder/oov-list/glue-grammar b/src/test/resources/decoder/oov-list/glue-grammar similarity index 100% rename from test/decoder/oov-list/glue-grammar rename to src/test/resources/decoder/oov-list/glue-grammar diff --git a/test/decoder/oov-list/grammar b/src/test/resources/decoder/oov-list/grammar similarity index 100% rename from test/decoder/oov-list/grammar rename to src/test/resources/decoder/oov-list/grammar diff --git a/test/decoder/oov-list/input.txt b/src/test/resources/decoder/oov-list/input.txt similarity index 100% rename from test/decoder/oov-list/input.txt rename to src/test/resources/decoder/oov-list/input.txt diff --git a/test/decoder/oov-list/output.gold b/src/test/resources/decoder/oov-list/output.gold similarity index 100% rename from test/decoder/oov-list/output.gold rename to src/test/resources/decoder/oov-list/output.gold diff --git a/test/decoder/oov-list/test.sh b/src/test/resources/decoder/oov-list/test.sh similarity index 100% rename from test/decoder/oov-list/test.sh rename to src/test/resources/decoder/oov-list/test.sh diff --git a/test/decoder/phrase/constrained/config b/src/test/resources/decoder/phrase/constrained/config similarity index 100% rename from test/decoder/phrase/constrained/config rename to src/test/resources/decoder/phrase/constrained/config diff --git a/test/decoder/phrase/constrained/corpus.es b/src/test/resources/decoder/phrase/constrained/corpus.es similarity index 100% rename from test/decoder/phrase/constrained/corpus.es rename to src/test/resources/decoder/phrase/constrained/corpus.es diff --git a/test/decoder/phrase/constrained/glue.grammar b/src/test/resources/decoder/phrase/constrained/glue.grammar similarity index 100% rename from test/decoder/phrase/constrained/glue.grammar rename to src/test/resources/decoder/phrase/constrained/glue.grammar diff --git a/test/decoder/phrase/constrained/output.gold b/src/test/resources/decoder/phrase/constrained/output.gold similarity index 100% rename from test/decoder/phrase/constrained/output.gold rename to src/test/resources/decoder/phrase/constrained/output.gold diff --git 
a/test/decoder/phrase/constrained/test.sh b/src/test/resources/decoder/phrase/constrained/test.sh similarity index 100% rename from test/decoder/phrase/constrained/test.sh rename to src/test/resources/decoder/phrase/constrained/test.sh diff --git a/test/decoder/phrase/decode/config b/src/test/resources/decoder/phrase/decode/config similarity index 100% rename from test/decoder/phrase/decode/config rename to src/test/resources/decoder/phrase/decode/config diff --git a/test/decoder/phrase/decode/config.packed b/src/test/resources/decoder/phrase/decode/config.packed similarity index 100% rename from test/decoder/phrase/decode/config.packed rename to src/test/resources/decoder/phrase/decode/config.packed diff --git a/test/decoder/phrase/decode/corpus.es b/src/test/resources/decoder/phrase/decode/corpus.es similarity index 100% rename from test/decoder/phrase/decode/corpus.es rename to src/test/resources/decoder/phrase/decode/corpus.es diff --git a/test/decoder/phrase/decode/lm.1.gz b/src/test/resources/decoder/phrase/decode/lm.1.gz similarity index 100% rename from test/decoder/phrase/decode/lm.1.gz rename to src/test/resources/decoder/phrase/decode/lm.1.gz diff --git a/test/decoder/phrase/decode/output.gold b/src/test/resources/decoder/phrase/decode/output.gold similarity index 100% rename from test/decoder/phrase/decode/output.gold rename to src/test/resources/decoder/phrase/decode/output.gold diff --git a/test/decoder/phrase/decode/rules.1.gz b/src/test/resources/decoder/phrase/decode/rules.1.gz similarity index 100% rename from test/decoder/phrase/decode/rules.1.gz rename to src/test/resources/decoder/phrase/decode/rules.1.gz diff --git a/test/decoder/phrase/decode/rules.packed/config b/src/test/resources/decoder/phrase/decode/rules.packed/config similarity index 100% rename from test/decoder/phrase/decode/rules.packed/config rename to src/test/resources/decoder/phrase/decode/rules.packed/config diff --git a/test/decoder/phrase/decode/rules.packed/encoding b/src/test/resources/decoder/phrase/decode/rules.packed/encoding similarity index 100% rename from test/decoder/phrase/decode/rules.packed/encoding rename to src/test/resources/decoder/phrase/decode/rules.packed/encoding diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.features b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features similarity index 100% rename from test/decoder/phrase/decode/rules.packed/slice_00000.features rename to src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.source b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source similarity index 100% rename from test/decoder/phrase/decode/rules.packed/slice_00000.source rename to src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.target b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target similarity index 100% rename from test/decoder/phrase/decode/rules.packed/slice_00000.target rename to src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target diff --git a/test/decoder/phrase/decode/rules.packed/slice_00000.target.lookup b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup similarity index 100% rename from test/decoder/phrase/decode/rules.packed/slice_00000.target.lookup rename to src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup diff --git 
a/test/decoder/phrase/decode/rules.packed/vocabulary b/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary similarity index 100% rename from test/decoder/phrase/decode/rules.packed/vocabulary rename to src/test/resources/decoder/phrase/decode/rules.packed/vocabulary diff --git a/test/decoder/phrase/decode/test-packed.sh b/src/test/resources/decoder/phrase/decode/test-packed.sh similarity index 100% rename from test/decoder/phrase/decode/test-packed.sh rename to src/test/resources/decoder/phrase/decode/test-packed.sh diff --git a/test/decoder/phrase/decode/test.sh b/src/test/resources/decoder/phrase/decode/test.sh similarity index 100% rename from test/decoder/phrase/decode/test.sh rename to src/test/resources/decoder/phrase/decode/test.sh diff --git a/test/decoder/phrase/include-align-index/README b/src/test/resources/decoder/phrase/include-align-index/README similarity index 100% rename from test/decoder/phrase/include-align-index/README rename to src/test/resources/decoder/phrase/include-align-index/README diff --git a/test/decoder/phrase/include-align-index/config b/src/test/resources/decoder/phrase/include-align-index/config similarity index 100% rename from test/decoder/phrase/include-align-index/config rename to src/test/resources/decoder/phrase/include-align-index/config diff --git a/test/decoder/phrase/include-align-index/corpus.es b/src/test/resources/decoder/phrase/include-align-index/corpus.es similarity index 100% rename from test/decoder/phrase/include-align-index/corpus.es rename to src/test/resources/decoder/phrase/include-align-index/corpus.es diff --git a/test/decoder/phrase/include-align-index/lm.1.gz b/src/test/resources/decoder/phrase/include-align-index/lm.1.gz similarity index 100% rename from test/decoder/phrase/include-align-index/lm.1.gz rename to src/test/resources/decoder/phrase/include-align-index/lm.1.gz diff --git a/test/decoder/phrase/include-align-index/log b/src/test/resources/decoder/phrase/include-align-index/log similarity index 100% rename from test/decoder/phrase/include-align-index/log rename to src/test/resources/decoder/phrase/include-align-index/log diff --git a/test/decoder/phrase/include-align-index/output b/src/test/resources/decoder/phrase/include-align-index/output similarity index 100% rename from test/decoder/phrase/include-align-index/output rename to src/test/resources/decoder/phrase/include-align-index/output diff --git a/test/decoder/phrase/include-align-index/output.gold b/src/test/resources/decoder/phrase/include-align-index/output.gold similarity index 100% rename from test/decoder/phrase/include-align-index/output.gold rename to src/test/resources/decoder/phrase/include-align-index/output.gold diff --git a/test/decoder/phrase/include-align-index/rules.1.gz b/src/test/resources/decoder/phrase/include-align-index/rules.1.gz similarity index 100% rename from test/decoder/phrase/include-align-index/rules.1.gz rename to src/test/resources/decoder/phrase/include-align-index/rules.1.gz diff --git a/test/decoder/phrase/include-align-index/test.sh b/src/test/resources/decoder/phrase/include-align-index/test.sh similarity index 100% rename from test/decoder/phrase/include-align-index/test.sh rename to src/test/resources/decoder/phrase/include-align-index/test.sh diff --git a/test/decoder/phrase/unique-hypotheses/README b/src/test/resources/decoder/phrase/unique-hypotheses/README similarity index 100% rename from test/decoder/phrase/unique-hypotheses/README rename to src/test/resources/decoder/phrase/unique-hypotheses/README diff 
--git a/test/decoder/phrase/unique-hypotheses/corpus.es b/src/test/resources/decoder/phrase/unique-hypotheses/corpus.es similarity index 100% rename from test/decoder/phrase/unique-hypotheses/corpus.es rename to src/test/resources/decoder/phrase/unique-hypotheses/corpus.es diff --git a/test/decoder/phrase/unique-hypotheses/joshua.config b/src/test/resources/decoder/phrase/unique-hypotheses/joshua.config similarity index 100% rename from test/decoder/phrase/unique-hypotheses/joshua.config rename to src/test/resources/decoder/phrase/unique-hypotheses/joshua.config diff --git a/test/decoder/phrase/unique-hypotheses/lm.1.gz b/src/test/resources/decoder/phrase/unique-hypotheses/lm.1.gz similarity index 100% rename from test/decoder/phrase/unique-hypotheses/lm.1.gz rename to src/test/resources/decoder/phrase/unique-hypotheses/lm.1.gz diff --git a/test/decoder/phrase/unique-hypotheses/output.gold b/src/test/resources/decoder/phrase/unique-hypotheses/output.gold similarity index 100% rename from test/decoder/phrase/unique-hypotheses/output.gold rename to src/test/resources/decoder/phrase/unique-hypotheses/output.gold diff --git a/test/decoder/phrase/unique-hypotheses/rules.1.gz b/src/test/resources/decoder/phrase/unique-hypotheses/rules.1.gz similarity index 100% rename from test/decoder/phrase/unique-hypotheses/rules.1.gz rename to src/test/resources/decoder/phrase/unique-hypotheses/rules.1.gz diff --git a/test/decoder/phrase/unique-hypotheses/test.sh b/src/test/resources/decoder/phrase/unique-hypotheses/test.sh similarity index 100% rename from test/decoder/phrase/unique-hypotheses/test.sh rename to src/test/resources/decoder/phrase/unique-hypotheses/test.sh diff --git a/test/decoder/regexp-grammar-both-rule-types/.gitignore b/src/test/resources/decoder/regexp-grammar-both-rule-types/.gitignore similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/.gitignore rename to src/test/resources/decoder/regexp-grammar-both-rule-types/.gitignore diff --git a/test/decoder/regexp-grammar-both-rule-types/README b/src/test/resources/decoder/regexp-grammar-both-rule-types/README similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/README rename to src/test/resources/decoder/regexp-grammar-both-rule-types/README diff --git a/test/decoder/regexp-grammar-both-rule-types/config b/src/test/resources/decoder/regexp-grammar-both-rule-types/config similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/config rename to src/test/resources/decoder/regexp-grammar-both-rule-types/config diff --git a/test/decoder/regexp-grammar-both-rule-types/glue-grammar b/src/test/resources/decoder/regexp-grammar-both-rule-types/glue-grammar similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/glue-grammar rename to src/test/resources/decoder/regexp-grammar-both-rule-types/glue-grammar diff --git a/test/decoder/regexp-grammar-both-rule-types/input b/src/test/resources/decoder/regexp-grammar-both-rule-types/input similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/input rename to src/test/resources/decoder/regexp-grammar-both-rule-types/input diff --git a/test/decoder/regexp-grammar-both-rule-types/output.gold b/src/test/resources/decoder/regexp-grammar-both-rule-types/output.gold similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/output.gold rename to src/test/resources/decoder/regexp-grammar-both-rule-types/output.gold diff --git a/test/decoder/regexp-grammar-both-rule-types/regexp-grammar 
b/src/test/resources/decoder/regexp-grammar-both-rule-types/regexp-grammar similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/regexp-grammar rename to src/test/resources/decoder/regexp-grammar-both-rule-types/regexp-grammar diff --git a/test/decoder/regexp-grammar-both-rule-types/test.sh b/src/test/resources/decoder/regexp-grammar-both-rule-types/test.sh similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/test.sh rename to src/test/resources/decoder/regexp-grammar-both-rule-types/test.sh diff --git a/test/decoder/regexp-grammar-both-rule-types/weights b/src/test/resources/decoder/regexp-grammar-both-rule-types/weights similarity index 100% rename from test/decoder/regexp-grammar-both-rule-types/weights rename to src/test/resources/decoder/regexp-grammar-both-rule-types/weights diff --git a/test/decoder/regexp-grammar/.gitignore b/src/test/resources/decoder/regexp-grammar/.gitignore similarity index 100% rename from test/decoder/regexp-grammar/.gitignore rename to src/test/resources/decoder/regexp-grammar/.gitignore diff --git a/test/decoder/regexp-grammar/README b/src/test/resources/decoder/regexp-grammar/README similarity index 100% rename from test/decoder/regexp-grammar/README rename to src/test/resources/decoder/regexp-grammar/README diff --git a/test/decoder/regexp-grammar/config b/src/test/resources/decoder/regexp-grammar/config similarity index 100% rename from test/decoder/regexp-grammar/config rename to src/test/resources/decoder/regexp-grammar/config diff --git a/test/decoder/regexp-grammar/glue-grammar b/src/test/resources/decoder/regexp-grammar/glue-grammar similarity index 100% rename from test/decoder/regexp-grammar/glue-grammar rename to src/test/resources/decoder/regexp-grammar/glue-grammar diff --git a/test/decoder/regexp-grammar/input b/src/test/resources/decoder/regexp-grammar/input similarity index 100% rename from test/decoder/regexp-grammar/input rename to src/test/resources/decoder/regexp-grammar/input diff --git a/test/decoder/regexp-grammar/output.gold b/src/test/resources/decoder/regexp-grammar/output.gold similarity index 100% rename from test/decoder/regexp-grammar/output.gold rename to src/test/resources/decoder/regexp-grammar/output.gold diff --git a/test/decoder/regexp-grammar/regexp-grammar b/src/test/resources/decoder/regexp-grammar/regexp-grammar similarity index 100% rename from test/decoder/regexp-grammar/regexp-grammar rename to src/test/resources/decoder/regexp-grammar/regexp-grammar diff --git a/test/decoder/regexp-grammar/test.sh b/src/test/resources/decoder/regexp-grammar/test.sh similarity index 100% rename from test/decoder/regexp-grammar/test.sh rename to src/test/resources/decoder/regexp-grammar/test.sh diff --git a/test/decoder/regexp-grammar/weights b/src/test/resources/decoder/regexp-grammar/weights similarity index 100% rename from test/decoder/regexp-grammar/weights rename to src/test/resources/decoder/regexp-grammar/weights diff --git a/test/decoder/rescoring/glue-grammar b/src/test/resources/decoder/rescoring/glue-grammar similarity index 100% rename from test/decoder/rescoring/glue-grammar rename to src/test/resources/decoder/rescoring/glue-grammar diff --git a/test/decoder/rescoring/grammar.gz b/src/test/resources/decoder/rescoring/grammar.gz similarity index 100% rename from test/decoder/rescoring/grammar.gz rename to src/test/resources/decoder/rescoring/grammar.gz diff --git a/test/decoder/rescoring/input.txt b/src/test/resources/decoder/rescoring/input.txt similarity index 100% 
rename from test/decoder/rescoring/input.txt rename to src/test/resources/decoder/rescoring/input.txt diff --git a/test/decoder/rescoring/joshua.config b/src/test/resources/decoder/rescoring/joshua.config similarity index 100% rename from test/decoder/rescoring/joshua.config rename to src/test/resources/decoder/rescoring/joshua.config diff --git a/test/decoder/rescoring/output.gold b/src/test/resources/decoder/rescoring/output.gold similarity index 100% rename from test/decoder/rescoring/output.gold rename to src/test/resources/decoder/rescoring/output.gold diff --git a/test/decoder/rescoring/test.sh b/src/test/resources/decoder/rescoring/test.sh similarity index 100% rename from test/decoder/rescoring/test.sh rename to src/test/resources/decoder/rescoring/test.sh diff --git a/test/decoder/segment-oovs/config b/src/test/resources/decoder/segment-oovs/config similarity index 100% rename from test/decoder/segment-oovs/config rename to src/test/resources/decoder/segment-oovs/config diff --git a/test/decoder/segment-oovs/input.txt b/src/test/resources/decoder/segment-oovs/input.txt similarity index 100% rename from test/decoder/segment-oovs/input.txt rename to src/test/resources/decoder/segment-oovs/input.txt diff --git a/test/decoder/segment-oovs/output.expected b/src/test/resources/decoder/segment-oovs/output.expected similarity index 100% rename from test/decoder/segment-oovs/output.expected rename to src/test/resources/decoder/segment-oovs/output.expected diff --git a/test/decoder/segment-oovs/test.sh b/src/test/resources/decoder/segment-oovs/test.sh similarity index 100% rename from test/decoder/segment-oovs/test.sh rename to src/test/resources/decoder/segment-oovs/test.sh diff --git a/test/decoder/source-annotations/grammar b/src/test/resources/decoder/source-annotations/grammar similarity index 100% rename from test/decoder/source-annotations/grammar rename to src/test/resources/decoder/source-annotations/grammar diff --git a/test/decoder/source-annotations/grammar.glue b/src/test/resources/decoder/source-annotations/grammar.glue similarity index 100% rename from test/decoder/source-annotations/grammar.glue rename to src/test/resources/decoder/source-annotations/grammar.glue diff --git a/test/decoder/source-annotations/input.txt b/src/test/resources/decoder/source-annotations/input.txt similarity index 100% rename from test/decoder/source-annotations/input.txt rename to src/test/resources/decoder/source-annotations/input.txt diff --git a/test/decoder/source-annotations/joshua.config b/src/test/resources/decoder/source-annotations/joshua.config similarity index 100% rename from test/decoder/source-annotations/joshua.config rename to src/test/resources/decoder/source-annotations/joshua.config diff --git a/test/decoder/source-annotations/lm.kenlm b/src/test/resources/decoder/source-annotations/lm.kenlm similarity index 100% rename from test/decoder/source-annotations/lm.kenlm rename to src/test/resources/decoder/source-annotations/lm.kenlm diff --git a/test/decoder/source-annotations/output.gold b/src/test/resources/decoder/source-annotations/output.gold similarity index 100% rename from test/decoder/source-annotations/output.gold rename to src/test/resources/decoder/source-annotations/output.gold diff --git a/test/decoder/source-annotations/test.sh b/src/test/resources/decoder/source-annotations/test.sh similarity index 100% rename from test/decoder/source-annotations/test.sh rename to src/test/resources/decoder/source-annotations/test.sh diff --git a/test/decoder/target-bigram/out.gold 
b/src/test/resources/decoder/target-bigram/out.gold similarity index 100% rename from test/decoder/target-bigram/out.gold rename to src/test/resources/decoder/target-bigram/out.gold diff --git a/test/decoder/target-bigram/test.sh b/src/test/resources/decoder/target-bigram/test.sh similarity index 100% rename from test/decoder/target-bigram/test.sh rename to src/test/resources/decoder/target-bigram/test.sh diff --git a/test/decoder/target-bigram/vocab b/src/test/resources/decoder/target-bigram/vocab similarity index 100% rename from test/decoder/target-bigram/vocab rename to src/test/resources/decoder/target-bigram/vocab diff --git a/test/decoder/too-long/output.gold b/src/test/resources/decoder/too-long/output.gold similarity index 100% rename from test/decoder/too-long/output.gold rename to src/test/resources/decoder/too-long/output.gold diff --git a/test/decoder/too-long/test.sh b/src/test/resources/decoder/too-long/test.sh similarity index 100% rename from test/decoder/too-long/test.sh rename to src/test/resources/decoder/too-long/test.sh diff --git a/test/decoder/tree-output/fragment-map.txt b/src/test/resources/decoder/tree-output/fragment-map.txt similarity index 100% rename from test/decoder/tree-output/fragment-map.txt rename to src/test/resources/decoder/tree-output/fragment-map.txt diff --git a/test/decoder/tree-output/glue-grammar b/src/test/resources/decoder/tree-output/glue-grammar similarity index 100% rename from test/decoder/tree-output/glue-grammar rename to src/test/resources/decoder/tree-output/glue-grammar diff --git a/test/decoder/tree-output/grammar.gz b/src/test/resources/decoder/tree-output/grammar.gz similarity index 100% rename from test/decoder/tree-output/grammar.gz rename to src/test/resources/decoder/tree-output/grammar.gz diff --git a/test/decoder/tree-output/input b/src/test/resources/decoder/tree-output/input similarity index 100% rename from test/decoder/tree-output/input rename to src/test/resources/decoder/tree-output/input diff --git a/test/decoder/tree-output/joshua.config b/src/test/resources/decoder/tree-output/joshua.config similarity index 100% rename from test/decoder/tree-output/joshua.config rename to src/test/resources/decoder/tree-output/joshua.config diff --git a/test/decoder/tree-output/lm.gz b/src/test/resources/decoder/tree-output/lm.gz similarity index 100% rename from test/decoder/tree-output/lm.gz rename to src/test/resources/decoder/tree-output/lm.gz diff --git a/test/decoder/tree-output/output.gold b/src/test/resources/decoder/tree-output/output.gold similarity index 100% rename from test/decoder/tree-output/output.gold rename to src/test/resources/decoder/tree-output/output.gold diff --git a/test/decoder/tree-output/test.sh b/src/test/resources/decoder/tree-output/test.sh similarity index 100% rename from test/decoder/tree-output/test.sh rename to src/test/resources/decoder/tree-output/test.sh diff --git a/test/grammar/sparse-features/grammar b/src/test/resources/grammar/sparse-features/grammar similarity index 100% rename from test/grammar/sparse-features/grammar rename to src/test/resources/grammar/sparse-features/grammar diff --git a/test/grammar/sparse-features/grammar.glue b/src/test/resources/grammar/sparse-features/grammar.glue similarity index 100% rename from test/grammar/sparse-features/grammar.glue rename to src/test/resources/grammar/sparse-features/grammar.glue diff --git a/test/grammar/sparse-features/grammar.packed/encoding b/src/test/resources/grammar/sparse-features/grammar.packed/encoding similarity index 100% 
rename from test/grammar/sparse-features/grammar.packed/encoding rename to src/test/resources/grammar/sparse-features/grammar.packed/encoding diff --git a/test/grammar/sparse-features/grammar.packed/slice_00000.features b/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.features similarity index 100% rename from test/grammar/sparse-features/grammar.packed/slice_00000.features rename to src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.features diff --git a/test/grammar/sparse-features/grammar.packed/slice_00000.source b/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.source similarity index 100% rename from test/grammar/sparse-features/grammar.packed/slice_00000.source rename to src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.source diff --git a/test/grammar/sparse-features/grammar.packed/slice_00000.target b/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target similarity index 100% rename from test/grammar/sparse-features/grammar.packed/slice_00000.target rename to src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target diff --git a/test/grammar/sparse-features/grammar.packed/slice_00000.target.lookup b/src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target.lookup similarity index 100% rename from test/grammar/sparse-features/grammar.packed/slice_00000.target.lookup rename to src/test/resources/grammar/sparse-features/grammar.packed/slice_00000.target.lookup diff --git a/test/grammar/sparse-features/grammar.packed/vocabulary b/src/test/resources/grammar/sparse-features/grammar.packed/vocabulary similarity index 100% rename from test/grammar/sparse-features/grammar.packed/vocabulary rename to src/test/resources/grammar/sparse-features/grammar.packed/vocabulary diff --git a/test/grammar/sparse-features/joshua-packed.config b/src/test/resources/grammar/sparse-features/joshua-packed.config similarity index 100% rename from test/grammar/sparse-features/joshua-packed.config rename to src/test/resources/grammar/sparse-features/joshua-packed.config diff --git a/test/grammar/sparse-features/joshua.config b/src/test/resources/grammar/sparse-features/joshua.config similarity index 100% rename from test/grammar/sparse-features/joshua.config rename to src/test/resources/grammar/sparse-features/joshua.config diff --git a/test/grammar/sparse-features/output.gold b/src/test/resources/grammar/sparse-features/output.gold similarity index 100% rename from test/grammar/sparse-features/output.gold rename to src/test/resources/grammar/sparse-features/output.gold diff --git a/test/grammar/sparse-features/test-packed.sh b/src/test/resources/grammar/sparse-features/test-packed.sh similarity index 100% rename from test/grammar/sparse-features/test-packed.sh rename to src/test/resources/grammar/sparse-features/test-packed.sh diff --git a/test/grammar/sparse-features/test.sh b/src/test/resources/grammar/sparse-features/test.sh similarity index 100% rename from test/grammar/sparse-features/test.sh rename to src/test/resources/grammar/sparse-features/test.sh diff --git a/test/joshua/README.broken b/src/test/resources/joshua/README.broken similarity index 100% rename from test/joshua/README.broken rename to src/test/resources/joshua/README.broken diff --git a/test/lattice-short/README b/src/test/resources/lattice-short/README similarity index 100% rename from test/lattice-short/README rename to src/test/resources/lattice-short/README diff --git 
a/test/lattice-short/glue-grammar b/src/test/resources/lattice-short/glue-grammar similarity index 100% rename from test/lattice-short/glue-grammar rename to src/test/resources/lattice-short/glue-grammar diff --git a/test/lattice-short/grammar.test b/src/test/resources/lattice-short/grammar.test similarity index 100% rename from test/lattice-short/grammar.test rename to src/test/resources/lattice-short/grammar.test diff --git a/test/lattice-short/input b/src/test/resources/lattice-short/input similarity index 100% rename from test/lattice-short/input rename to src/test/resources/lattice-short/input diff --git a/test/lattice-short/joshua.config b/src/test/resources/lattice-short/joshua.config similarity index 100% rename from test/lattice-short/joshua.config rename to src/test/resources/lattice-short/joshua.config diff --git a/test/lattice-short/output.expected b/src/test/resources/lattice-short/output.expected similarity index 100% rename from test/lattice-short/output.expected rename to src/test/resources/lattice-short/output.expected diff --git a/test/lattice-short/test.lm b/src/test/resources/lattice-short/test.lm similarity index 100% rename from test/lattice-short/test.lm rename to src/test/resources/lattice-short/test.lm diff --git a/test/lattice-short/test.sh b/src/test/resources/lattice-short/test.sh similarity index 100% rename from test/lattice-short/test.sh rename to src/test/resources/lattice-short/test.sh diff --git a/test/lattice/.gitignore b/src/test/resources/lattice/.gitignore similarity index 100% rename from test/lattice/.gitignore rename to src/test/resources/lattice/.gitignore diff --git a/test/lattice/README b/src/test/resources/lattice/README similarity index 100% rename from test/lattice/README rename to src/test/resources/lattice/README diff --git a/test/lattice/glue-grammar b/src/test/resources/lattice/glue-grammar similarity index 100% rename from test/lattice/glue-grammar rename to src/test/resources/lattice/glue-grammar diff --git a/test/lattice/grammar.test b/src/test/resources/lattice/grammar.test similarity index 100% rename from test/lattice/grammar.test rename to src/test/resources/lattice/grammar.test diff --git a/test/lattice/joshua.config b/src/test/resources/lattice/joshua.config similarity index 100% rename from test/lattice/joshua.config rename to src/test/resources/lattice/joshua.config diff --git a/test/lattice/output.expected b/src/test/resources/lattice/output.expected similarity index 100% rename from test/lattice/output.expected rename to src/test/resources/lattice/output.expected diff --git a/test/lattice/test-lattice.pdf b/src/test/resources/lattice/test-lattice.pdf similarity index 100% rename from test/lattice/test-lattice.pdf rename to src/test/resources/lattice/test-lattice.pdf diff --git a/test/lattice/test.lm b/src/test/resources/lattice/test.lm similarity index 100% rename from test/lattice/test.lm rename to src/test/resources/lattice/test.lm diff --git a/test/lattice/test.plf b/src/test/resources/lattice/test.plf similarity index 100% rename from test/lattice/test.plf rename to src/test/resources/lattice/test.plf diff --git a/test/lattice/test.sh b/src/test/resources/lattice/test.sh similarity index 100% rename from test/lattice/test.sh rename to src/test/resources/lattice/test.sh diff --git a/test/lm/berkeley/lm b/src/test/resources/lm/berkeley/lm similarity index 100% rename from test/lm/berkeley/lm rename to src/test/resources/lm/berkeley/lm diff --git a/test/lm/berkeley/lm.berkeleylm b/src/test/resources/lm/berkeley/lm.berkeleylm 
similarity index 100% rename from test/lm/berkeley/lm.berkeleylm rename to src/test/resources/lm/berkeley/lm.berkeleylm diff --git a/test/lm/berkeley/lm.berkeleylm.gz b/src/test/resources/lm/berkeley/lm.berkeleylm.gz similarity index 100% rename from test/lm/berkeley/lm.berkeleylm.gz rename to src/test/resources/lm/berkeley/lm.berkeleylm.gz diff --git a/test/lm/berkeley/lm.gz b/src/test/resources/lm/berkeley/lm.gz similarity index 100% rename from test/lm/berkeley/lm.gz rename to src/test/resources/lm/berkeley/lm.gz diff --git a/test/lm/berkeley/output.gold b/src/test/resources/lm/berkeley/output.gold similarity index 100% rename from test/lm/berkeley/output.gold rename to src/test/resources/lm/berkeley/output.gold diff --git a/test/lm/berkeley/test.sh b/src/test/resources/lm/berkeley/test.sh similarity index 100% rename from test/lm/berkeley/test.sh rename to src/test/resources/lm/berkeley/test.sh diff --git a/test/packed-grammar/.gitignore b/src/test/resources/packed-grammar/.gitignore similarity index 100% rename from test/packed-grammar/.gitignore rename to src/test/resources/packed-grammar/.gitignore diff --git a/test/packed-grammar/README b/src/test/resources/packed-grammar/README similarity index 100% rename from test/packed-grammar/README rename to src/test/resources/packed-grammar/README diff --git a/test/packed-grammar/grammar.gz b/src/test/resources/packed-grammar/grammar.gz similarity index 100% rename from test/packed-grammar/grammar.gz rename to src/test/resources/packed-grammar/grammar.gz diff --git a/test/packed-grammar/input.bn b/src/test/resources/packed-grammar/input.bn similarity index 100% rename from test/packed-grammar/input.bn rename to src/test/resources/packed-grammar/input.bn diff --git a/test/packed-grammar/joshua.config b/src/test/resources/packed-grammar/joshua.config similarity index 100% rename from test/packed-grammar/joshua.config rename to src/test/resources/packed-grammar/joshua.config diff --git a/test/packed-grammar/lm.gz b/src/test/resources/packed-grammar/lm.gz similarity index 100% rename from test/packed-grammar/lm.gz rename to src/test/resources/packed-grammar/lm.gz diff --git a/test/packed-grammar/output.gold b/src/test/resources/packed-grammar/output.gold similarity index 100% rename from test/packed-grammar/output.gold rename to src/test/resources/packed-grammar/output.gold diff --git a/test/packed-grammar/reference.en.0 b/src/test/resources/packed-grammar/reference.en.0 similarity index 100% rename from test/packed-grammar/reference.en.0 rename to src/test/resources/packed-grammar/reference.en.0 diff --git a/test/packed-grammar/reference.en.1 b/src/test/resources/packed-grammar/reference.en.1 similarity index 100% rename from test/packed-grammar/reference.en.1 rename to src/test/resources/packed-grammar/reference.en.1 diff --git a/test/packed-grammar/reference.en.2 b/src/test/resources/packed-grammar/reference.en.2 similarity index 100% rename from test/packed-grammar/reference.en.2 rename to src/test/resources/packed-grammar/reference.en.2 diff --git a/test/packed-grammar/reference.en.3 b/src/test/resources/packed-grammar/reference.en.3 similarity index 100% rename from test/packed-grammar/reference.en.3 rename to src/test/resources/packed-grammar/reference.en.3 diff --git a/test/packed-grammar/test-multiple.sh b/src/test/resources/packed-grammar/test-multiple.sh similarity index 100% rename from test/packed-grammar/test-multiple.sh rename to src/test/resources/packed-grammar/test-multiple.sh diff --git a/test/packed-grammar/test.sh 
b/src/test/resources/packed-grammar/test.sh similarity index 100% rename from test/packed-grammar/test.sh rename to src/test/resources/packed-grammar/test.sh diff --git a/test/parser/grammar b/src/test/resources/parser/grammar similarity index 100% rename from test/parser/grammar rename to src/test/resources/parser/grammar diff --git a/test/parser/grammar.glue b/src/test/resources/parser/grammar.glue similarity index 100% rename from test/parser/grammar.glue rename to src/test/resources/parser/grammar.glue diff --git a/test/parser/input b/src/test/resources/parser/input similarity index 100% rename from test/parser/input rename to src/test/resources/parser/input diff --git a/test/parser/output.gold b/src/test/resources/parser/output.gold similarity index 100% rename from test/parser/output.gold rename to src/test/resources/parser/output.gold diff --git a/test/parser/parse.config b/src/test/resources/parser/parse.config similarity index 100% rename from test/parser/parse.config rename to src/test/resources/parser/parse.config diff --git a/test/parser/test.sh b/src/test/resources/parser/test.sh similarity index 100% rename from test/parser/test.sh rename to src/test/resources/parser/test.sh diff --git a/test/parser/weights b/src/test/resources/parser/weights similarity index 100% rename from test/parser/weights rename to src/test/resources/parser/weights diff --git a/test/pipeline/.gitignore b/src/test/resources/pipeline/.gitignore similarity index 100% rename from test/pipeline/.gitignore rename to src/test/resources/pipeline/.gitignore diff --git a/test/pipeline/Makefile b/src/test/resources/pipeline/Makefile similarity index 100% rename from test/pipeline/Makefile rename to src/test/resources/pipeline/Makefile diff --git a/test/pipeline/final-bleu.gold b/src/test/resources/pipeline/final-bleu.gold similarity index 100% rename from test/pipeline/final-bleu.gold rename to src/test/resources/pipeline/final-bleu.gold diff --git a/test/pipeline/input/devtest.en.0 b/src/test/resources/pipeline/input/devtest.en.0 similarity index 100% rename from test/pipeline/input/devtest.en.0 rename to src/test/resources/pipeline/input/devtest.en.0 diff --git a/test/pipeline/input/devtest.en.1 b/src/test/resources/pipeline/input/devtest.en.1 similarity index 100% rename from test/pipeline/input/devtest.en.1 rename to src/test/resources/pipeline/input/devtest.en.1 diff --git a/test/pipeline/input/devtest.en.2 b/src/test/resources/pipeline/input/devtest.en.2 similarity index 100% rename from test/pipeline/input/devtest.en.2 rename to src/test/resources/pipeline/input/devtest.en.2 diff --git a/test/pipeline/input/devtest.en.3 b/src/test/resources/pipeline/input/devtest.en.3 similarity index 100% rename from test/pipeline/input/devtest.en.3 rename to src/test/resources/pipeline/input/devtest.en.3 diff --git a/test/pipeline/input/devtest.ur b/src/test/resources/pipeline/input/devtest.ur similarity index 100% rename from test/pipeline/input/devtest.ur rename to src/test/resources/pipeline/input/devtest.ur diff --git a/test/pipeline/input/train.en b/src/test/resources/pipeline/input/train.en similarity index 100% rename from test/pipeline/input/train.en rename to src/test/resources/pipeline/input/train.en diff --git a/test/pipeline/input/train.ur b/src/test/resources/pipeline/input/train.ur similarity index 100% rename from test/pipeline/input/train.ur rename to src/test/resources/pipeline/input/train.ur diff --git a/test/pipeline/input/tune.en.0 b/src/test/resources/pipeline/input/tune.en.0 similarity index 100% 
rename from test/pipeline/input/tune.en.0 rename to src/test/resources/pipeline/input/tune.en.0 diff --git a/test/pipeline/input/tune.en.1 b/src/test/resources/pipeline/input/tune.en.1 similarity index 100% rename from test/pipeline/input/tune.en.1 rename to src/test/resources/pipeline/input/tune.en.1 diff --git a/test/pipeline/input/tune.en.2 b/src/test/resources/pipeline/input/tune.en.2 similarity index 100% rename from test/pipeline/input/tune.en.2 rename to src/test/resources/pipeline/input/tune.en.2 diff --git a/test/pipeline/input/tune.en.3 b/src/test/resources/pipeline/input/tune.en.3 similarity index 100% rename from test/pipeline/input/tune.en.3 rename to src/test/resources/pipeline/input/tune.en.3 diff --git a/test/pipeline/input/tune.ur b/src/test/resources/pipeline/input/tune.ur similarity index 100% rename from test/pipeline/input/tune.ur rename to src/test/resources/pipeline/input/tune.ur diff --git a/test/pipeline/test-ghkm.sh b/src/test/resources/pipeline/test-ghkm.sh similarity index 100% rename from test/pipeline/test-ghkm.sh rename to src/test/resources/pipeline/test-ghkm.sh diff --git a/test/pipeline/test.sh b/src/test/resources/pipeline/test.sh similarity index 100% rename from test/pipeline/test.sh rename to src/test/resources/pipeline/test.sh diff --git a/test/prune-equivalent-translations.py b/src/test/resources/prune-equivalent-translations.py similarity index 100% rename from test/prune-equivalent-translations.py rename to src/test/resources/prune-equivalent-translations.py diff --git a/test/run-all-tests.sh b/src/test/resources/run-all-tests.sh similarity index 100% rename from test/run-all-tests.sh rename to src/test/resources/run-all-tests.sh diff --git a/test/scripts/.gitignore b/src/test/resources/scripts/.gitignore similarity index 100% rename from test/scripts/.gitignore rename to src/test/resources/scripts/.gitignore diff --git a/test/scripts/merge_lms_test.py b/src/test/resources/scripts/merge_lms_test.py similarity index 100% rename from test/scripts/merge_lms_test.py rename to src/test/resources/scripts/merge_lms_test.py diff --git a/test/scripts/normalization/.gitignore b/src/test/resources/scripts/normalization/.gitignore similarity index 100% rename from test/scripts/normalization/.gitignore rename to src/test/resources/scripts/normalization/.gitignore diff --git a/test/scripts/normalization/data/train.en b/src/test/resources/scripts/normalization/data/train.en similarity index 100% rename from test/scripts/normalization/data/train.en rename to src/test/resources/scripts/normalization/data/train.en diff --git a/test/scripts/normalization/data/train.en.norm b/src/test/resources/scripts/normalization/data/train.en.norm similarity index 100% rename from test/scripts/normalization/data/train.en.norm rename to src/test/resources/scripts/normalization/data/train.en.norm diff --git a/test/scripts/normalization/test.sh b/src/test/resources/scripts/normalization/test.sh similarity index 100% rename from test/scripts/normalization/test.sh rename to src/test/resources/scripts/normalization/test.sh diff --git a/test/scripts/run_bundler_test.py b/src/test/resources/scripts/run_bundler_test.py similarity index 100% rename from test/scripts/run_bundler_test.py rename to src/test/resources/scripts/run_bundler_test.py diff --git a/test/scripts/support/moses_grammar/input b/src/test/resources/scripts/support/moses_grammar/input similarity index 100% rename from test/scripts/support/moses_grammar/input rename to src/test/resources/scripts/support/moses_grammar/input 
diff --git a/test/scripts/support/moses_grammar/output.expected b/src/test/resources/scripts/support/moses_grammar/output.expected similarity index 100% rename from test/scripts/support/moses_grammar/output.expected rename to src/test/resources/scripts/support/moses_grammar/output.expected diff --git a/test/scripts/support/moses_grammar/test.sh b/src/test/resources/scripts/support/moses_grammar/test.sh similarity index 100% rename from test/scripts/support/moses_grammar/test.sh rename to src/test/resources/scripts/support/moses_grammar/test.sh diff --git a/test/server/http/expected b/src/test/resources/server/http/expected similarity index 100% rename from test/server/http/expected rename to src/test/resources/server/http/expected diff --git a/test/server/http/test.sh b/src/test/resources/server/http/test.sh similarity index 100% rename from test/server/http/test.sh rename to src/test/resources/server/http/test.sh diff --git a/test/server/tcp-text/expected b/src/test/resources/server/tcp-text/expected similarity index 100% rename from test/server/tcp-text/expected rename to src/test/resources/server/tcp-text/expected diff --git a/test/server/tcp-text/test.sh b/src/test/resources/server/tcp-text/test.sh similarity index 100% rename from test/server/tcp-text/test.sh rename to src/test/resources/server/tcp-text/test.sh diff --git a/src/test/resources/testng.xml b/src/test/resources/testng.xml new file mode 100644 index 00000000..4908ed13 --- /dev/null +++ b/src/test/resources/testng.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + +
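For orientation, a TestNG suite file such as the src/test/resources/testng.xml added above generally takes the following shape; the suite, test, and class names in this sketch are illustrative assumptions, not the actual contents of the added file:

```xml
<!DOCTYPE suite SYSTEM "http://testng.org/testng-1.0.dtd">
<!-- Hypothetical sketch of a TestNG suite definition; all names here
     are assumptions, not taken from the real testng.xml. -->
<suite name="Example-Suite">
  <test name="example-tests">
    <classes>
      <class name="org.example.ExampleTest"/>
    </classes>
  </test>
</suite>
```

diff --git a/test/thrax/.gitignore b/src/test/resources/thrax/.gitignore similarity index 100% rename from test/thrax/.gitignore rename to src/test/resources/thrax/.gitignore diff --git a/test/thrax/extraction/input/thrax.conf b/src/test/resources/thrax/extraction/input/thrax.conf similarity index 100% rename from test/thrax/extraction/input/thrax.conf rename to src/test/resources/thrax/extraction/input/thrax.conf diff --git a/test/thrax/extraction/input/train.a b/src/test/resources/thrax/extraction/input/train.a similarity index 100% rename from test/thrax/extraction/input/train.a rename to src/test/resources/thrax/extraction/input/train.a diff --git a/test/thrax/extraction/input/train.en b/src/test/resources/thrax/extraction/input/train.en similarity index 100% rename from test/thrax/extraction/input/train.en rename to src/test/resources/thrax/extraction/input/train.en diff --git a/test/thrax/extraction/input/train.ps b/src/test/resources/thrax/extraction/input/train.ps similarity index 100% rename from test/thrax/extraction/input/train.ps rename to src/test/resources/thrax/extraction/input/train.ps diff --git a/test/thrax/extraction/test.sh b/src/test/resources/thrax/extraction/test.sh similarity index 100% rename from test/thrax/extraction/test.sh rename to src/test/resources/thrax/extraction/test.sh diff --git a/test/thrax/filtering/dev.hi-en.hi.1 b/src/test/resources/thrax/filtering/dev.hi-en.hi.1 similarity index 100% rename from test/thrax/filtering/dev.hi-en.hi.1 rename to src/test/resources/thrax/filtering/dev.hi-en.hi.1 diff --git a/test/thrax/filtering/exact.gold b/src/test/resources/thrax/filtering/exact.gold similarity index 100% rename from test/thrax/filtering/exact.gold rename to src/test/resources/thrax/filtering/exact.gold diff --git a/test/thrax/filtering/exact.log.gold b/src/test/resources/thrax/filtering/exact.log.gold similarity index 100% rename from test/thrax/filtering/exact.log.gold rename to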
src/test/resources/thrax/filtering/exact.log.gold diff --git a/test/thrax/filtering/fast.gold b/src/test/resources/thrax/filtering/fast.gold similarity index 100% rename from test/thrax/filtering/fast.gold rename to src/test/resources/thrax/filtering/fast.gold diff --git a/test/thrax/filtering/fast.log.gold b/src/test/resources/thrax/filtering/fast.log.gold similarity index 100% rename from test/thrax/filtering/fast.log.gold rename to src/test/resources/thrax/filtering/fast.log.gold diff --git a/test/thrax/filtering/grammar.de b/src/test/resources/thrax/filtering/grammar.de similarity index 100% rename from test/thrax/filtering/grammar.de rename to src/test/resources/thrax/filtering/grammar.de diff --git a/test/thrax/filtering/grammar.filtered.gz b/src/test/resources/thrax/filtering/grammar.filtered.gz similarity index 100% rename from test/thrax/filtering/grammar.filtered.gz rename to src/test/resources/thrax/filtering/grammar.filtered.gz diff --git a/test/thrax/filtering/input.de b/src/test/resources/thrax/filtering/input.de similarity index 100% rename from test/thrax/filtering/input.de rename to src/test/resources/thrax/filtering/input.de diff --git a/test/thrax/filtering/loose.log.gold b/src/test/resources/thrax/filtering/loose.log.gold similarity index 100% rename from test/thrax/filtering/loose.log.gold rename to src/test/resources/thrax/filtering/loose.log.gold diff --git a/test/thrax/filtering/test-exact.sh b/src/test/resources/thrax/filtering/test-exact.sh similarity index 100% rename from test/thrax/filtering/test-exact.sh rename to src/test/resources/thrax/filtering/test-exact.sh diff --git a/test/thrax/filtering/test-fast.sh b/src/test/resources/thrax/filtering/test-fast.sh similarity index 100% rename from test/thrax/filtering/test-fast.sh rename to src/test/resources/thrax/filtering/test-fast.sh diff --git a/test/thrax/filtering/test-loose.sh b/src/test/resources/thrax/filtering/test-loose.sh similarity index 100% rename from test/thrax/filtering/test-loose.sh rename to src/test/resources/thrax/filtering/test-loose.sh diff --git a/test/joshua/corpus/CorpusArrayTest.java b/test/joshua/corpus/CorpusArrayTest.java deleted file mode 100644 index 66e46536..00000000 --- a/test/joshua/corpus/CorpusArrayTest.java +++ /dev/null @@ -1,176 +0,0 @@ -package joshua.corpus; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Date; -import java.util.logging.Logger; - -import joshua.corpus.CorpusArray; -import joshua.corpus.Phrase; -import joshua.corpus.mm.MemoryMappedCorpusArray; -import joshua.corpus.suffix_array.SuffixArrayFactory; -import joshua.corpus.vocab.Vocabulary; -import joshua.util.FormatUtil; - - -import org.testng.Assert; -import org.testng.annotations.Test; - - - - -public class CorpusArrayTest { - - /** Logger for this class. 
*/ - private static Logger logger = - Logger.getLogger(CorpusArrayTest.class.getName()); - -// @Test -// public void writePartsToDisk() { -// -// String filename = "data/tiny.en"; -// int numSentences = 5; // Should be 5 sentences in tiny.en -// int numWords = 89; // Should be 89 words in tiny.en -// -// -// try { -// -// // FIX: can't use createVocabulary(String) because we set numWords and numSentences -// Vocabulary vocab = new Vocabulary(); -// SuffixArrayFactory.createVocabulary(filename, vocab); -// CorpusArray corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences); -// -// corpus.writeWordIDsToFile(filename+".bin"); -// corpus.writeSentenceLengthsToFile(filename+".sbin"); -// -// MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(corpus.getVocabulary(), filename+".bin", numWords*4, filename+".sbin", numSentences*4); -// -// // For each word in the corpus, -// for (int i=0; i, , , -pau-*/ - int numBuiltInSymbols = 9; - - /** <unk>, <s>, </s>, -pau- */ - int numBuiltInTerminals = 4; - - @Test - public void basicVocabTest() { - - Vocabulary vocab1 = new Vocabulary(); - Vocabulary vocab2 = new Vocabulary(new HashSet<String>()); - - Assert.assertEquals(vocab1, vocab2); - - Assert.assertFalse(vocab1.intToString.isEmpty()); -// Assert.assertTrue(vocab1.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING); - Assert.assertFalse(vocab1.getWords().isEmpty()); - Assert.assertTrue(vocab1.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING); - Assert.assertEquals(vocab1.getWords(), vocab1.intToString.values()); - - Assert.assertEquals(vocab1.size(), numBuiltInSymbols); - Assert.assertEquals(vocab1.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING); - - //Assert.assertEquals(vocab1.getID("sample"), Vocabulary.UNKNOWN_WORD); - //Assert.assertEquals(vocab1.getID(null), Vocabulary.UNKNOWN_WORD); - - Assert.assertFalse(vocab1.terminalToInt.isEmpty()); - Assert.assertEquals(vocab1.terminalToInt.size(), this.numBuiltInTerminals); -// Assert.assertFalse(vocab1.isFixed); -// -// vocab1.fixVocabulary(); -// Assert.assertTrue(vocab1.isFixed); - - Assert.assertEquals(vocab1.getID(Vocabulary.X_STRING), -1); - Assert.assertEquals(vocab1.getID(Vocabulary.X1_STRING), -2); - Assert.assertEquals(vocab1.getID(Vocabulary.X2_STRING), -3); - - Assert.assertEquals(vocab1.getWord(-1), Vocabulary.X_STRING); - Assert.assertEquals(vocab1.getWord(-2), Vocabulary.X1_STRING); - Assert.assertEquals(vocab1.getWord(-3), Vocabulary.X2_STRING); - - - - Assert.assertFalse(vocab2.intToString.isEmpty()); -// Assert.assertTrue(vocab2.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING); - Assert.assertFalse(vocab2.getWords().isEmpty()); -// Assert.assertTrue(vocab2.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING); - Assert.assertEquals(vocab2.getWords(), vocab2.intToString.values()); - - Assert.assertEquals(vocab2.size(), numBuiltInSymbols); - Assert.assertEquals(vocab2.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING); - -// Assert.assertEquals(vocab2.getID("sample"), Vocabulary.UNKNOWN_WORD); -// Assert.assertEquals(vocab2.getID(null), Vocabulary.UNKNOWN_WORD); - - Assert.assertFalse(vocab2.terminalToInt.isEmpty()); - Assert.assertEquals(vocab2.terminalToInt.size(), this.numBuiltInTerminals); -// Assert.assertTrue(vocab2.isFixed); - - - - }
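The basicVocabTest assertions above pin down a sign convention in the old Vocabulary: terminals map to non-negative IDs (ID 0 is the unknown-word string) and nonterminals map to negative ones (X to -1, X1 to -2, X2 to -3). A minimal self-contained sketch of that convention follows; the class and helper below are hypothetical illustrations, not Joshua's API:

```java
/** Sketch of the ID sign convention asserted in basicVocabTest:
 *  terminals get non-negative IDs, nonterminals get negative IDs. */
public class IdConventionSketch {

  // Hypothetical helper; the real Vocabulary exposes getID/getWord instead.
  static boolean isNonterminal(int id) {
    return id < 0; // e.g. X = -1, X1 = -2, X2 = -3 in the test above
  }

  public static void main(String[] args) {
    System.out.println(isNonterminal(-2)); // true: X1
    System.out.println(isNonterminal(0));  // false: ID 0 is the unknown word
  }
}
```

- - @Test - public void verifyWordIDs() throws IOException { - - // Adam Lopez's example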
- String corpusString = "it makes him and it mars him , it sets him on and it takes him off ."; -// String queryString = "it persuades him and it disheartens him"; - - String sourceFileName; - { - File sourceFile = File.createTempFile("source", new Date().toString()); - PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8"); - sourcePrintStream.println(corpusString); - sourcePrintStream.close(); - sourceFileName = sourceFile.getAbsolutePath(); - } - - Vocabulary vocab = new Vocabulary(); - Vocabulary.initializeVocabulary(sourceFileName, vocab, true); - - Assert.assertEquals(vocab.getWord(vocab.getID("it")), "it"); - Assert.assertEquals(vocab.getWord(vocab.getID("makes")), "makes"); - Assert.assertEquals(vocab.getWord(vocab.getID("him")), "him"); - Assert.assertEquals(vocab.getWord(vocab.getID("and")), "and"); - Assert.assertEquals(vocab.getWord(vocab.getID("mars")), "mars"); - Assert.assertEquals(vocab.getWord(vocab.getID(",")), ","); - Assert.assertEquals(vocab.getWord(vocab.getID("sets")), "sets"); - Assert.assertEquals(vocab.getWord(vocab.getID("on")), "on"); - Assert.assertEquals(vocab.getWord(vocab.getID("takes")), "takes"); - Assert.assertEquals(vocab.getWord(vocab.getID("off")), "off"); - -// Assert.assertEquals(vocab.getWord(vocab.getID("persuades")), Vocabulary.UNKNOWN_WORD_STRING); -// Assert.assertEquals(vocab.getWord(vocab.getID("disheartens")), Vocabulary.UNKNOWN_WORD_STRING); - } - - @Test - public void loadVocabFromFile() { - - String filename = "data/tiny.en"; - int numSentences = 5; // Should be 5 sentences in tiny.en - int numWords = 89; // Should be 89 words in tiny.en - int numUniqWords = 60; // Should be 60 unique words in tiny.en - - Vocabulary vocab = new Vocabulary(); - Vocabulary vocab2 = new Vocabulary(); - - Assert.assertTrue(vocab.equals(vocab2)); - Assert.assertTrue(vocab2.equals(vocab)); - Assert.assertEquals(vocab, vocab2); - - try { - int[] result = Vocabulary.initializeVocabulary(filename, vocab, true); - Assert.assertNotNull(result); - Assert.assertEquals(result.length, 2); - Assert.assertEquals(result[0], numWords); - Assert.assertEquals(result[1], numSentences); - -// Assert.assertTrue(vocab.isFixed); - Assert.assertEquals(vocab.size(), numUniqWords+numBuiltInSymbols); - - } catch (IOException e) { - Assert.fail("Could not load file " + filename); - } - - Assert.assertFalse(vocab.equals(vocab2)); - - try { - int[] result = Vocabulary.initializeVocabulary(filename, vocab2, true); - Assert.assertNotNull(result); - Assert.assertEquals(result.length, 2); - Assert.assertEquals(result[0], numWords); - Assert.assertEquals(result[1], numSentences); - -// Assert.assertTrue(vocab2.isFixed); - Assert.assertEquals(vocab2.size(), numUniqWords+numBuiltInSymbols); - - } catch (IOException e) { - Assert.fail("Could not load file " + filename); - } - - Assert.assertEquals(vocab, vocab2); - } -} diff --git a/test/joshua/decoder/DecoderThreadTest.java b/test/joshua/decoder/DecoderThreadTest.java deleted file mode 100644 index 78e46bd1..00000000 --- a/test/joshua/decoder/DecoderThreadTest.java +++ /dev/null @@ -1,178 +0,0 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - * MA 02111-1307 USA - */ -package joshua.decoder; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Date; -import java.util.Scanner; - -import joshua.corpus.Corpus; -import joshua.corpus.alignment.AlignmentGrids; -import joshua.corpus.suffix_array.Compile; -import joshua.corpus.suffix_array.SuffixArrayFactory; -import joshua.corpus.vocab.Vocabulary; -import joshua.prefix_tree.ExtractRules; - -import org.testng.Assert; -import org.testng.annotations.Test; - -/** - * Unit tests for decoder thread. - * - * @author Lane Schwartz - * @version $LastChangedDate$ - */ -public class DecoderThreadTest { - - @Test - public void setup() { - - String[] sourceSentences = { - "a b c d", - "a b c d", - "a b c d" - }; - - String[] targetSentences = { - "w x y z", - "w t u v", - "s x y z" - }; - - String[] alignmentLines = { - "0-0 1-1 2-2 3-3", - "0-0 1-1 2-2 3-3", - "0-0 1-1 2-2 3-3" - }; - - String[] testSentences = { - "a b c" - }; - - try { - - // Set up source corpus - File sourceFile = File.createTempFile("source", new Date().toString()); - PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8"); - for (String sentence : sourceSentences) { - sourcePrintStream.println(sentence); - } - sourcePrintStream.close(); - String sourceCorpusFileName = sourceFile.getAbsolutePath(); - - Vocabulary Vocabulary = new Vocabulary(); - int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, Vocabulary, true); - Assert.assertEquals(sourceLengths.length, 2); - int numberOfSentences = sourceLengths[1]; - - Corpus sourceCorpus = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, Vocabulary, sourceLengths[0], sourceLengths[1]); - - - // Set up target corpus - File targetFile = File.createTempFile("target", new Date().toString()); - PrintStream targetPrintStream = new PrintStream(targetFile, "UTF-8"); - for (String sentence : targetSentences) { - targetPrintStream.println(sentence); - } - targetPrintStream.close(); - String targetCorpusFileName = targetFile.getAbsolutePath(); - - int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, Vocabulary, true); - Assert.assertEquals(targetLengths.length, sourceLengths.length); - for (int i=0, n=targetLengths.length; i<n; i++) - - Map<Integer,Integer> counts = new HashMap<Integer,Integer>(); - - boolean iterationOccurred = false; - - for (ArpaNgram ngram : arpaFile) { - - iterationOccurred = true; - - int order = ngram.order(); - // System.err.println("Order = " + order); - - int count; - if (counts.containsKey(order)) { - count = counts.get(order) + 1; - } else { - count = 1; - } - - counts.put(order, count); - - } - - Assert.assertTrue(iterationOccurred); - - Assert.assertTrue(counts.containsKey(1)); - Assert.assertTrue(counts.containsKey(2)); - Assert.assertTrue(counts.containsKey(3)); - - Assert.assertEquals((int) counts.get(1), 8); - Assert.assertEquals((int) counts.get(2), 5); - Assert.assertEquals((int) counts.get(3), 1); - - } - - @Test(dependsOnMethods = { "setup" }) - public void testSize() { - ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); - -
Assert.assertEquals(arpaFile.size(), 14); - } - - @Test(dependsOnMethods = { "setup", "testIteration" }) - public void testChildren() throws FileNotFoundException { - ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); - - TrieLM lm = new TrieLM(arpaFile); - // System.err.println(lm.getChildren().size()); - Assert.assertNotSame(lm.getChildren().size(), 0); - } - - @Test(dependsOnMethods = { "setup", "testIteration", "testChildren" }) - public void testTrie() throws FileNotFoundException { - ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab); - - TrieLM lm = new TrieLM(arpaFile); - - testLm(lm); - - } - - @Test(dependsOnMethods = { "setup", "testIteration", "testChildren" }) - public void testBerkeley() throws FileNotFoundException { - - LMGrammarBerkeley lm = new LMGrammarBerkeley(vocab, 3, arpaFileName); - - testLm(lm); - - } - - /** - * @param lm the language model under test - */ - private void testLm(AbstractLM lm) { - // Test unigrams known to be in the language model - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a")), -1.992672, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because")), -2.713723, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("boycott")), -4.678545, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of")), -1.609573, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("parliament")), -3.875917, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato")), -9.753210, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption")), -4.678545, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the")), -1.712444, 0.000001f); - - // Test unigrams known to NOT be in the language model - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("banana")), -JoshuaConfiguration.lm_ceiling_cost, 0.000001f); - - // Test bigrams known to be in the language model - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of")), -0.3552987, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the")), -0.7507797, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption of")), -0.7266324, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the resumption")), -3.936147, 0.000001f); - - // Test trigrams known to be in the language model - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of the")), -0.6309999f, 0.000001f); - - // Test bigrams known to NOT be in the language model (but the unigrams are) - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a boycott")), -4.678545f + -0.1195484f, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of parliament")), -3.875917f + -0.1991907f, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the potato")), -9.753210f + -0.1606644f, 0.000001f); - Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato parliament")), -3.875917f + -0.0f, 0.000001f); - - // Test trigrams known to NOT be in the language model (but the bigrams are) - int[] words = vocab.getIDs("because of a"); - double f = lm.ngramLogProbability(words); - Assert.assertEquals(f, -1.403534f + -0.03083654f, 0.000001f); - // //Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the parliament")), -3.875917f + -0.05237135f, 0.000001f); - } -}
diff --git a/test/joshua/lattice/ArcTest.java b/test/joshua/lattice/ArcTest.java deleted file mode 100644 index 51b3bb86..00000000 ---
a/test/joshua/lattice/ArcTest.java +++ /dev/null @@ -1,82 +0,0 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package joshua.lattice; - -import org.testng.Assert; -import org.testng.annotations.Test; - -/** - * Unit tests for Arc class. - * - * @author Lane Schwartz - * @since 2008-07-09 - * @version $LastChangedDate$ - */ -@Test(groups = { "lattice_arc" }) -public class ArcTest { - - private final Node<String> head = new Node<String>(1); - private final Node<String> tail = new Node<String>(2); - private final double cost = Math.PI; - private final String label = "pi"; - - private Arc<String> arc; - - @Test(dependsOnMethods = { "joshua.lattice.NodeTest.constructNode" }) - //@Test(dependsOnGroups = {"lattice_node" }) - public void constructArc() { - - arc = new Arc<String>(head, tail, cost, label); - - Assert.assertEquals(arc.head, head); - Assert.assertEquals(arc.tail, tail); - Assert.assertEquals(arc.cost, cost); - Assert.assertEquals(arc.label, label); - - } - - @Test(dependsOnMethods = { "constructArc" }) - public void getHead() { - - Assert.assertEquals(arc.getHead(), head); - - } - - - @Test(dependsOnMethods = { "constructArc" }) - public void getTail() { - - Assert.assertEquals(arc.getTail(), tail); - - } - - - @Test(dependsOnMethods = { "constructArc" }) - public void getCost() { - - Assert.assertEquals(arc.getCost(), cost); - - } - - - @Test(dependsOnMethods = { "constructArc" }) - public void getLabel() { - - Assert.assertEquals(arc.getLabel(), label); - - } -}
diff --git a/test/joshua/lattice/LatticeTest.java b/test/joshua/lattice/LatticeTest.java deleted file mode 100644 index d0957b78..00000000 --- a/test/joshua/lattice/LatticeTest.java +++ /dev/null @@ -1,194 +0,0 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package joshua.lattice; - -import java.util.ArrayList; -import java.util.List; - -import org.testng.Assert; -import org.testng.annotations.Test; - -/** - * Unit tests for Lattice class.
- * - * @author Lane Schwartz - * @since 2008-07-09 - * @version $LastChangedDate$ - */ -@Test(groups = { "lattice" }) -public class LatticeTest { - - @Test - public void allPairsShortestPath() { - - List<Node<String>> nodes = new ArrayList<Node<String>>(); - for (int i=0; i<4; i++) { - nodes.add(new Node<String>(i)); - } - - nodes.get(0).addArc(nodes.get(1), 1.0, "x"); - nodes.get(1).addArc(nodes.get(2), 1.0, "y"); - nodes.get(0).addArc(nodes.get(2), 1.5, "a"); - nodes.get(2).addArc(nodes.get(3), 3.0, "b"); - nodes.get(2).addArc(nodes.get(3), 5.0, "c"); - - Lattice<String> graph = new Lattice<String>(nodes); - - // Note: getShortestPath() measures path length in arcs (hop count), not summed cost. - Assert.assertEquals(graph.getShortestPath(0, 1), 1.0); - Assert.assertEquals(graph.getShortestPath(0, 2), 1.0); - Assert.assertEquals(graph.getShortestPath(1, 2), 1.0); - Assert.assertEquals(graph.getShortestPath(0, 3), 2.0); - Assert.assertEquals(graph.getShortestPath(1, 3), 2.0); - Assert.assertEquals(graph.getShortestPath(2, 3), 1.0); - } - - @Test - public void createFromString() { - - String data = - - // Start of lattice - "("+ - - // Node 0 - "("+ - "('A',1.0,5),"+ // Arc with label A and cost 1.0. Destination is Node 5 (Node 0 + span of 5) - "('B',1.0,2),"+ // Arc with label B and cost 1.0. Destination is Node 2 (Node 0 + span of 2) - "('C',1.0,3),"+ // Arc with label C and cost 1.0. Destination is Node 3 (Node 0 + span of 3) - "('D',1.0,1),"+ // Arc with label D and cost 1.0. Destination is Node 1 (Node 0 + span of 1) - ")," + - - // Node 1 - "(" + - "('E',1.0,4)," + // Arc with label E and cost 1.0. Destination is Node 5 (Node 1 + span of 4) - ")," + - - // Node 2 - "(" + - "('C',1.0,3)," + // Arc with label C and cost 1.0. Destination is Node 5 (Node 2 + span of 3) - ")," + - - // Node 3 - "(" + - "('D',1.0,1)," + // Arc with label D and cost 1.0. Destination is Node 4 (Node 3 + span of 1) - ")," + - - // Node 4 - "(" + - "('E',1.0,1)," + // Arc with label E and cost 1.0. Destination is Node 5 (Node 4 + span of 1) - ")," + - - // Node 5 - "(" + - "('X',1.0,1)," + // Arc with label X and cost 1.0. Destination is Node 6 (Node 5 + span of 1) - ")," + - - // There is an implicit final state (Node 6).
- - ")"; // End of lattice - - - Lattice lattice = Lattice.createFromString(data); - - int numberOfNodes = 7; - - Assert.assertEquals(lattice.size(), numberOfNodes); - - Node node0 = lattice.getNode(0); - Node node1 = lattice.getNode(1); - Node node2 = lattice.getNode(2); - Node node3 = lattice.getNode(3); - Node node4 = lattice.getNode(4); - Node node5 = lattice.getNode(5); - Node node6 = lattice.getNode(6); - - Assert.assertEquals(node0.size(), 4); - Assert.assertEquals(node1.size(), 1); - Assert.assertEquals(node2.size(), 1); - Assert.assertEquals(node3.size(), 1); - Assert.assertEquals(node4.size(), 1); - Assert.assertEquals(node5.size(), 1); - Assert.assertEquals(node6.size(), 0); - - - // Node 0 outgoing arcs - - Arc arcA_0_5 = node0.outgoingArcs.get(0); - Assert.assertEquals(arcA_0_5.getLabel(), "A"); - Assert.assertEquals(arcA_0_5.getHead(), node0); - Assert.assertEquals(arcA_0_5.getTail(), node5); - Assert.assertEquals(arcA_0_5.getCost(), 1.0); - - Arc arcB_0_2 = node0.outgoingArcs.get(1); - Assert.assertEquals(arcB_0_2.getLabel(), "B"); - Assert.assertEquals(arcB_0_2.getHead(), node0); - Assert.assertEquals(arcB_0_2.getTail(), node2); - Assert.assertEquals(arcB_0_2.getCost(), 1.0); - - Arc arcC_0_3 = node0.outgoingArcs.get(2); - Assert.assertEquals(arcC_0_3.getLabel(), "C"); - Assert.assertEquals(arcC_0_3.getHead(), node0); - Assert.assertEquals(arcC_0_3.getTail(), node3); - Assert.assertEquals(arcC_0_3.getCost(), 1.0); - - Arc arcD_0_1 = node0.outgoingArcs.get(3); - Assert.assertEquals(arcD_0_1.getLabel(), "D"); - Assert.assertEquals(arcD_0_1.getHead(), node0); - Assert.assertEquals(arcD_0_1.getTail(), node1); - Assert.assertEquals(arcD_0_1.getCost(), 1.0); - - - // Node 1 outgoing arcs - Arc arcE_1_5 = node1.outgoingArcs.get(0); - Assert.assertEquals(arcE_1_5.getLabel(), "E"); - Assert.assertEquals(arcE_1_5.getHead(), node1); - Assert.assertEquals(arcE_1_5.getTail(), node5); - Assert.assertEquals(arcE_1_5.getCost(), 1.0); - - - // Node 2 outgoing arcs - Arc arcC_2_5 = node2.outgoingArcs.get(0); - Assert.assertEquals(arcC_2_5.getLabel(), "C"); - Assert.assertEquals(arcC_2_5.getHead(), node2); - Assert.assertEquals(arcC_2_5.getTail(), node5); - Assert.assertEquals(arcC_2_5.getCost(), 1.0); - - - // Node 3 outgoing arcs - Arc arcD_3_4 = node3.outgoingArcs.get(0); - Assert.assertEquals(arcD_3_4.getLabel(), "D"); - Assert.assertEquals(arcD_3_4.getHead(), node3); - Assert.assertEquals(arcD_3_4.getTail(), node4); - Assert.assertEquals(arcD_3_4.getCost(), 1.0); - - - // Node 4 outgoing arcs - Arc arcE_4_5 = node4.outgoingArcs.get(0); - Assert.assertEquals(arcE_4_5.getLabel(), "E"); - Assert.assertEquals(arcE_4_5.getHead(), node4); - Assert.assertEquals(arcE_4_5.getTail(), node5); - Assert.assertEquals(arcE_1_5.getCost(), 1.0); - - - // Node 5 outgoing arcs - Arc arcX_5_6 = node5.outgoingArcs.get(0); - Assert.assertEquals(arcX_5_6.getLabel(), "X"); - Assert.assertEquals(arcX_5_6.getHead(), node5); - Assert.assertEquals(arcX_5_6.getTail(), node6); - Assert.assertEquals(arcX_5_6.getCost(), 1.0); - } -} diff --git a/test/joshua/lattice/NodeTest.java b/test/joshua/lattice/NodeTest.java deleted file mode 100644 index 147c7fe1..00000000 --- a/test/joshua/lattice/NodeTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* This file is part of the Joshua Machine Translation System. 
- * - * Joshua is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package joshua.lattice; - -import org.testng.Assert; -import org.testng.annotations.Test; - -/** - * Unit tests for Node class. - * - * @author Lane Schwartz - * @since 2008-07-09 - * @version $LastChangedDate$ - */ -@Test(groups = { "lattice_node" }) -public class NodeTest { - - private final int id = 12345; - - private Node<String> node; - - @Test - public void constructNode() { - - node = new Node<String>(id); - - Assert.assertEquals((int) node.id, (int) id); - Assert.assertTrue(node.outgoingArcs.isEmpty()); - Assert.assertEquals(node.size(), 0); - - } - - - @Test(dependsOnMethods = { "constructNode" }) - public void getNumber() { - - Assert.assertEquals(node.getNumber(), id); - - } - - - @Test(dependsOnMethods = { "constructNode" }) - public void toStringTest() { - - Assert.assertEquals(node.toString(), "Node-"+id); - - } - - - @Test(dependsOnMethods = { "constructNode", "joshua.lattice.ArcTest.constructArc" }) - public void addArc() { - - Node<String> n2 = new Node<String>(2); - double w2 = 0.123; - String l2 = "something cool"; - - Node<String> n3 = new Node<String>(3); - double w3 = 124.78; - String l3 = "hurray!"; - - Node<String> n4 = new Node<String>(4); - double w4 = Double.POSITIVE_INFINITY; - String l4 = "\u0000"; - - Assert.assertEquals(node.size(), 0); - - node.addArc(n2, w2, l2); - Assert.assertEquals(node.size(), 1); - Arc<String> a2 = node.outgoingArcs.get(0); - Assert.assertEquals(a2.head, node); - Assert.assertEquals(a2.tail, n2); - Assert.assertEquals(a2.cost, w2); - Assert.assertEquals(a2.label, l2); - - node.addArc(n3, w3, l3); - Assert.assertEquals(node.size(), 2); - Arc<String> a3 = node.outgoingArcs.get(1); - Assert.assertEquals(a3.head, node); - Assert.assertEquals(a3.tail, n3); - Assert.assertEquals(a3.cost, w3); - Assert.assertEquals(a3.label, l3); - - node.addArc(n4, w4, l4); - Assert.assertEquals(node.size(), 3); - Arc<String> a4 = node.outgoingArcs.get(2); - Assert.assertEquals(a4.head, node); - Assert.assertEquals(a4.tail, n4); - Assert.assertEquals(a4.cost, w4); - Assert.assertEquals(a4.label, l4); - - } -}
diff --git a/test/joshua/ui/tree_visualizer/tree/TreeTest.java b/test/joshua/ui/tree_visualizer/tree/TreeTest.java deleted file mode 100644 index 454c018b..00000000 --- a/test/joshua/ui/tree_visualizer/tree/TreeTest.java +++ /dev/null @@ -1,93 +0,0 @@ -package joshua.ui.tree_visualizer.tree; - -import java.util.List; - -import org.testng.Assert; -import org.testng.annotations.Test; - -public class TreeTest { - @Test(expectedExceptions = { IllegalArgumentException.class }) - public void ctor_EmptyString_IllegalArgument() { - Tree tree = new Tree(""); - Assert.assertEquals(tree.size(), 0); - } - - @Test(expectedExceptions = { IllegalArgumentException.class }) - public void ctor_TooFewCloseParens_IllegalArgument() { - Tree tree = new Tree("(A{0-1} foo"); - Assert.assertEquals(tree.size(), 0); - } - - @Test -
public void simpleTree_correctSize() { - Tree tree = new Tree("(A{0-1} foo)"); - Assert.assertEquals(tree.size(), 2); - } - - @Test - public void simpleTree_correctRoot() { - Tree tree = new Tree("(A{0-1} foo)"); - Tree.Node root = tree.root(); - Assert.assertEquals(root.label(), "A"); - Assert.assertEquals(root.sourceStartIndex(), 0); - Assert.assertEquals(root.sourceEndIndex(), 1); - Assert.assertEquals(root.children().size(), 1); - } - - @Test - public void simpleTree_correctLeaf() { - Tree tree = new Tree("(A{0-1} foo)"); - Tree.Node leaf = tree.root().children().get(0); - Assert.assertEquals(leaf.label(), "foo"); - Assert.assertEquals(leaf.sourceStartIndex(), -1); - Assert.assertEquals(leaf.sourceEndIndex(), -1); - Assert.assertEquals(leaf.children().size(), 0); - } - - @Test - public void simpleTree_toString() { - Tree tree = new Tree("(A{0-1} foo)"); - Assert.assertEquals(tree.toString(), "(A{0-1} foo)"); - } - - @Test - public void trickyTree_children() { - Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); - List<Tree.Node> children = tree.root().children(); - Assert.assertEquals(children.size(), 2); - Tree.Node foo = children.get(0); - Assert.assertEquals(foo.label(), "foo"); - Assert.assertTrue(foo.isLeaf()); - Assert.assertEquals(foo.sourceStartIndex(), -1); - Assert.assertEquals(foo.sourceEndIndex(), -1); - Tree.Node b = children.get(1); - Assert.assertEquals(b.label(), "B"); - Assert.assertEquals(b.children().size(), 1); - Assert.assertFalse(b.isLeaf()); - Assert.assertEquals(b.sourceStartIndex(), 1); - Assert.assertEquals(b.sourceEndIndex(), 2); - } - - @Test - public void SourceStartComparator() { - Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); - Tree.Node a = tree.root(); - Tree.Node b = a.children().get(1); - Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator(); - Assert.assertTrue(cmp.compare(a, b) < 0); - } - - @Test - public void SourceStartComparator_LeafSmallerThanAllInternals() { - Tree tree = new Tree("(A{0-2} foo (B{1-2} bar))"); - Tree.Node a = tree.root(); - Tree.Node foo = a.children().get(0); - Tree.Node b = a.children().get(1); - Tree.Node bar = b.children().get(0); - Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator(); - Assert.assertTrue(cmp.compare(foo, a) < 0); - Assert.assertTrue(cmp.compare(foo, b) < 0); - Assert.assertTrue(cmp.compare(bar, a) < 0); - Assert.assertTrue(cmp.compare(bar, b) < 0); - } -}
diff --git a/test/joshua/util/CacheTest.java b/test/joshua/util/CacheTest.java deleted file mode 100644 index a2eeef02..00000000 --- a/test/joshua/util/CacheTest.java +++ /dev/null @@ -1,35 +0,0 @@ -package joshua.util; - -import org.testng.Assert; -import org.testng.annotations.Test; - -public class CacheTest { - - @Test - public void test() { - - Cache<String,Integer> cache = new Cache<String,Integer>(5); - - cache.put("a", 1); - cache.put("b", 2); - cache.put("c", 3); - cache.put("d", 4); - cache.put("e", 5); - - Assert.assertTrue(cache.containsKey("a")); - Assert.assertTrue(cache.containsKey("b")); - Assert.assertTrue(cache.containsKey("c")); - Assert.assertTrue(cache.containsKey("d")); - Assert.assertTrue(cache.containsKey("e")); - - // Access the "a" element in the cache - cache.get("a"); - - // Now add a new element that exceeds the capacity of the cache - cache.put("f", 6); - - Assert.assertTrue(cache.containsKey("a")); - - } - -}
diff --git a/test/joshua/util/io/BinaryTest.java b/test/joshua/util/io/BinaryTest.java deleted file mode 100644 index cda8aba6..00000000 --- a/test/joshua/util/io/BinaryTest.java +++ /dev/null @@ -1,58 +0,0 @@
-package joshua.util.io; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInput; -import java.io.ObjectOutput; -import java.util.HashSet; -import java.util.Set; - -import joshua.corpus.vocab.Vocabulary; - -import org.testng.Assert; -import org.testng.annotations.Test; - -public class BinaryTest { - - - @Test - public void externalizeVocabulary() throws IOException, ClassNotFoundException { - - Set<String> words = new HashSet<String>(); - - for (char c1='a'; c1<='z'; c1++) { - words.add(new String(new char[]{c1})); - for (char c2='a'; c2<='z'; c2++) { - words.add(new String(new char[]{c1,c2})); - } - } - - Vocabulary vocab = new Vocabulary(words); - - try { - - File tempFile = File.createTempFile(BinaryTest.class.getName(), "vocab"); - FileOutputStream outputStream = new FileOutputStream(tempFile); - ObjectOutput out = new BinaryOut(outputStream, true); - vocab.writeExternal(out); - - ObjectInput in = new BinaryIn(tempFile.getAbsolutePath(), Vocabulary.class); - Object o = in.readObject(); - Assert.assertTrue(o instanceof Vocabulary); - - Vocabulary newVocab = (Vocabulary) o; - - Assert.assertNotNull(newVocab); - Assert.assertEquals(newVocab.size(), vocab.size()); - - Assert.assertEquals(newVocab, vocab); - - - - - } catch (SecurityException e) { - Assert.fail("Operating system is unable to create a temp file required by this unit test: " + e); - } - } -}
diff --git a/test/joshua/zmert/BLEUTest.java b/test/joshua/zmert/BLEUTest.java deleted file mode 100644 index 79fe8341..00000000 --- a/test/joshua/zmert/BLEUTest.java +++ /dev/null @@ -1,133 +0,0 @@ -/* This file is part of the Joshua Machine Translation System. - * - * Joshua is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package joshua.zmert; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Scanner; - -import joshua.zmert.BLEU; -import joshua.zmert.EvaluationMetric; - -import org.testng.Assert; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -/** - * Unit tests for BLEU class.
- * - * @author Lane Schwartz - * @version $LastChangedDate$ - */ -public class BLEUTest { - - @Test - public void metricName() { - - // Setup the EvaluationMetric class - EvaluationMetric.set_numSentences(0); - EvaluationMetric.set_refsPerSen(1); - EvaluationMetric.set_refSentences(null); - - BLEU bleu = new BLEU(); - - Assert.assertEquals(bleu.get_metricName(), "BLEU"); - - } - - @Test - public void defaultConstructor() { - - // Setup the EvaluationMetric class - EvaluationMetric.set_numSentences(0); - EvaluationMetric.set_refsPerSen(1); - EvaluationMetric.set_refSentences(null); - - BLEU bleu = new BLEU(); - - // Default constructor should use a maximum n-gram length of 4 - Assert.assertEquals(bleu.maxGramLength, 4); - - // Default constructor should use the closest reference - Assert.assertEquals(bleu.effLengthMethod, BLEU.EffectiveLengthMethod.CLOSEST); - - } - - @Test - public void simpleTest() { - - String ref = "this is the fourth chromosome whose sequence has been completed to date . it comprises more than 87 million pairs of dna ."; - String test = "this is the fourth chromosome to be fully sequenced up till now and it comprises of over 87 million pairs of deoxyribonucleic acid ( dna ) ."; - - // refSentences[i][r] stores the r'th reference of the i'th sentence - String[][] refSentences = new String[1][1]; - refSentences[0][0] = ref; - - EvaluationMetric.set_numSentences(1); - EvaluationMetric.set_refsPerSen(1); - EvaluationMetric.set_refSentences(refSentences); - - BLEU bleu = new BLEU(); - - // testSentences[i] stores the candidate translation for the i'th sentence - String[] testSentences = new String[1]; - testSentences[0] = test; - try { - // Check BLEU score matches - double actualScore = bleu.score(testSentences); - double expectedScore = 0.2513; - double acceptableScoreDelta = 0.00001f; - - Assert.assertEquals(actualScore, expectedScore, acceptableScoreDelta); - - // Check sufficient statistics match - int[] actualSS = bleu.suffStats(testSentences); - int[] expectedSS = {14,27,8,26,5,25,3,24,27,23}; - - Assert.assertEquals(actualSS[0], expectedSS[0], 0); // 1-gram matches - Assert.assertEquals(actualSS[1], expectedSS[1], 0); // 1-gram total - Assert.assertEquals(actualSS[2], expectedSS[2], 0); // 2-gram matches - Assert.assertEquals(actualSS[3], expectedSS[3], 0); // 2-gram total - Assert.assertEquals(actualSS[4], expectedSS[4], 0); // 3-gram matches - Assert.assertEquals(actualSS[5], expectedSS[5], 0); // 3-gram total - Assert.assertEquals(actualSS[6], expectedSS[6], 0); // 4-gram matches - Assert.assertEquals(actualSS[7], expectedSS[7], 0); // 4-gram total - Assert.assertEquals(actualSS[8], expectedSS[8], 0); // candidate length - Assert.assertEquals(actualSS[9], expectedSS[9], 0); // reference length - } catch (Exception e) { - Assert.fail(); - } - } - - @Parameters({"referenceFile","testFile"}) - @Test - public void fileTest(String referenceFile, String testFile) throws FileNotFoundException { - - //TODO You can now read in the files, and do something useful with them. 
- - Scanner refScanner = new Scanner(new File(referenceFile)); - - while (refScanner.hasNextLine()) { - - String refLine = refScanner.nextLine(); - - } - - - } - -} diff --git a/test/packed/Benchmark.java b/test/packed/Benchmark.java deleted file mode 100644 index 55e5d503..00000000 --- a/test/packed/Benchmark.java +++ /dev/null @@ -1,104 +0,0 @@ -package packed; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.IntBuffer; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.channels.FileChannel.MapMode; -import java.util.Random; -import java.util.logging.Logger; - -/** - * This program runs a little benchmark to check reading speed on various data - * representations. - * - * Usage: java Benchmark PACKED_GRAMMAR_DIR TIMES - */ - -public class Benchmark { - private static final Logger logger = Logger.getLogger(Benchmark.class.getName()); - - private IntBuffer intBuffer; - private MappedByteBuffer byteBuffer; - private int[] intArray; - - public Benchmark(String dir) throws IOException { - File file = new File(dir + "/slice_00000.source"); - - FileChannel source_channel = new FileInputStream(file).getChannel(); - int byte_size = (int) source_channel.size(); - int int_size = byte_size / 4; - - byteBuffer = source_channel.map(MapMode.READ_ONLY, 0, byte_size); - intBuffer = byteBuffer.asIntBuffer(); - - intArray = new int[int_size]; - intBuffer.get(intArray); - } - - public void benchmark(int times) { - logger.info("Beginning benchmark."); - - Random r = new Random(); - r.setSeed(1234567890); - int[] positions = new int[1000]; - for (int i = 0; i < positions.length; i++) - positions[i] = r.nextInt(intArray.length); - - long sum; - - long start_time = System.currentTimeMillis(); - - sum = 0; - for (int t = 0; t < times; t++) - for (int i = 0; i < positions.length; i++) - sum += byteBuffer.getInt(positions[i] * 4); - logger.info("Sum: " + sum); - long byte_time = System.currentTimeMillis(); - - sum = 0; - for (int t = 0; t < times; t++) - for (int i = 0; i < positions.length; i++) - sum += intBuffer.get(positions[i]); - logger.info("Sum: " + sum); - long int_time = System.currentTimeMillis(); - - sum = 0; - for (int t = 0; t < times; t++) - for (int i = 0; i < positions.length; i++) - sum += intArray[positions[i]]; - logger.info("Sum: " + sum); - long array_time = System.currentTimeMillis(); - - sum = 0; - for (int t = 0; t < times; t++) - for (int i = 0; i < (intArray.length / 8); i++) - sum += intArray[i * 6] + intArray[i * 6 + 2]; - logger.info("Sum: " + sum); - long mult_time = System.currentTimeMillis(); - - sum = 0; - for (int t = 0; t < times; t++) { - int index = 0; - for (int i = 0; i < (intArray.length / 8); i++) { - sum += intArray[index] + intArray[index + 2]; - index += 6; - } - } - logger.info("Sum: " + sum); - long add_time = System.currentTimeMillis(); - - logger.info("ByteBuffer: " + (byte_time - start_time)); - logger.info("IntBuffer: " + (int_time - byte_time)); - logger.info("Array: " + (array_time - int_time)); - logger.info("Multiply: " + (mult_time - array_time)); - logger.info("Add: " + (add_time - mult_time)); - } - - public static void main(String args[]) throws IOException { - Benchmark pr = new Benchmark(args[0]); - pr.benchmark( Integer.parseInt(args[1])); - } -} diff --git a/test/packed/CountRules.java b/test/packed/CountRules.java deleted file mode 100644 index 9c745e69..00000000 --- a/test/packed/CountRules.java +++ /dev/null @@ -1,92 +0,0 @@ -package packed; - -import 
java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.channels.FileChannel.MapMode; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; - -import joshua.corpus.Vocabulary; - -/** - * This program reads a packed representation and prints out some - * basic information about it. - * - * Usage: java CountRules PACKED_GRAMMAR_DIR - */ - -public class CountRules { - - public static void main(String args[]) { - - String dir = args[0]; - - File file = new File(dir + "/chunk_00000.source"); - FileInputStream stream = null; - FileChannel channel = null; - try { - // read the vocabulary - Vocabulary.read(dir + "/vocabulary"); - - // get the channel etc - stream = new FileInputStream(file); - channel = stream.getChannel(); - int size = (int) channel.size(); - - MappedByteBuffer buffer = channel.map(MapMode.READ_ONLY, 0, size); - // byte[] bytes = new bytes[size]; - // buffer.get(bytes); - - // read the number of rules - int numRules = buffer.getInt(); - System.out.println(String.format("There are %d source sides at the root", numRules)); - - // read the first symbol and its offset - for (int i = 0; i < numRules; i++) { - // String symbol = Vocabulary.word(buffer.getInt()); - int symbol = buffer.getInt(); - String string = Vocabulary.word(symbol); - int offset = buffer.getInt(); - System.out.println(String.format("-> %s/%d [%d]", string, symbol, offset)); - } - - } catch (IOException e) { - - e.printStackTrace(); - - } finally { - try { - if (stream != null) - stream.close(); - - if (channel != null) - channel.close(); - - } catch (IOException e) { - - e.printStackTrace(); - - } - } - - - // // Read in the bytes - // int offset = 0; - // int numRead = 0; - // while (offset < bytes.length - // && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { - // offset += numRead; - // } - - // // Ensure all the bytes have been read in - // if (offset < bytes.length) { - // throw new IOException("Could not completely read file "+file.getName()); - // } - - // // Close the input stream and return bytes - // is.close(); - // return bytes; - } -} diff --git a/test/packed/PrintRules.java b/test/packed/PrintRules.java deleted file mode 100644 index 8d3650d7..00000000 --- a/test/packed/PrintRules.java +++ /dev/null @@ -1,177 +0,0 @@ -package packed; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.IntBuffer; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.channels.FileChannel.MapMode; - -import joshua.corpus.Vocabulary; -import joshua.util.quantization.Quantizer; -import joshua.util.quantization.QuantizerConfiguration; - -/** - * This program reads a packed representation and prints out some basic - * information about it. - * - * Usage: java PrintRules PACKED_GRAMMAR_DIR - */ - -public class PrintRules { - - private QuantizerConfiguration quantization; - - private int[] source; - private int[] target; - private MappedByteBuffer features; - private MappedByteBuffer alignments; - - private int[] featureLookup; - private int[] alignmentLookup; - - private boolean have_alignments; - - public PrintRules(String dir) throws IOException { - File source_file = new File(dir + "/slice_00000.source"); - File target_file = new File(dir + "/slice_00000.target"); - File feature_file = new File(dir + "/slice_00000.features"); - File alignment_file = new File(dir + "/slice_00000.alignments"); - - have_alignments = alignment_file.exists(); - - // Read the vocabulary. 
- Vocabulary.read(dir + "/vocabulary"); - - // Read the quantizer setup. - quantization = new QuantizerConfiguration(); - quantization.read(dir + "/quantization"); - - // Get the channels etc. - FileChannel source_channel = new FileInputStream(source_file).getChannel(); - int source_size = (int) source_channel.size(); - IntBuffer source_buffer = source_channel.map(MapMode.READ_ONLY, 0, - source_size).asIntBuffer(); - source = new int[source_size / 4]; - source_buffer.get(source); - - FileChannel target_channel = new FileInputStream(target_file).getChannel(); - int target_size = (int) target_channel.size(); - IntBuffer target_buffer = target_channel.map(MapMode.READ_ONLY, 0, - target_size).asIntBuffer(); - target = new int[target_size / 4]; - target_buffer.get(target); - - FileChannel feature_channel = new FileInputStream(feature_file).getChannel(); - int feature_size = (int) feature_channel.size(); - features = feature_channel.map(MapMode.READ_ONLY, 0, feature_size); - - if (have_alignments) { - FileChannel alignment_channel = new FileInputStream(alignment_file).getChannel(); - int alignment_size = (int) alignment_channel.size(); - alignments = alignment_channel.map(MapMode.READ_ONLY, 0, alignment_size); - } - - int num_feature_blocks = features.getInt(); - featureLookup = new int[num_feature_blocks]; - // Read away data size. - features.getInt(); - for (int i = 0; i < num_feature_blocks; i++) - featureLookup[i] = features.getInt(); - - // Only read the alignment index when the alignments file exists; otherwise - // the alignments buffer is null and reading it would throw a NullPointerException. - if (have_alignments) { - int num_alignment_blocks = alignments.getInt(); - alignmentLookup = new int[num_alignment_blocks]; - // Read away data size. - alignments.getInt(); - for (int i = 0; i < num_alignment_blocks; i++) - alignmentLookup[i] = alignments.getInt(); - - if (num_alignment_blocks != num_feature_blocks) - throw new RuntimeException("Number of blocks doesn't match up."); - } - } - - public void traverse() { - traverse(0, ""); - } - - private void traverse(int position, String src_side) { - int num_children = source[position]; - int[] addresses = new int[num_children]; - int[] symbols = new int[num_children]; - int j = position + 1; - for (int i = 0; i < num_children; i++) { - symbols[i] = source[j++]; - addresses[i] = source[j++]; - } - int num_rules = source[j++]; - for (int i = 0; i < num_rules; i++) { - int lhs = source[j++]; - int tgt_address = source[j++]; - int data_address = source[j++]; - printRule(src_side, lhs, tgt_address, data_address); - } - for (int i = 0; i < num_children; i++) { - traverse(addresses[i], src_side + " " + Vocabulary.word(symbols[i])); - } - } - - private String getTarget(int pointer) { - StringBuilder sb = new StringBuilder(); - do { - pointer = target[pointer]; - if (pointer != -1) { - int symbol = target[pointer + 1]; - if (symbol < 0) - sb.append(" ").append("NT" + symbol); - else - sb.append(" ").append(Vocabulary.word(symbol)); - } - } while (pointer != -1); - return sb.toString(); - } - - private String getFeatures(int block_id) { - StringBuilder sb = new StringBuilder(); - - int data_position = featureLookup[block_id]; - int num_features = features.getInt(data_position); - data_position += 4; - for (int i = 0; i < num_features; i++) { - int feature_id = features.getInt(data_position); - Quantizer quantizer = quantization.get(feature_id); - sb.append(" " + Vocabulary.word(feature_id) + "=" + - quantizer.read(features, data_position)); - data_position += 4 + quantizer.size(); - } - return sb.toString(); - } - - private String getAlignments(int block_id) { - StringBuilder sb = new StringBuilder(); - - int data_position =
alignmentLookup[block_id]; - byte num_points = alignments.get(data_position); - for (int i = 0; i < num_points; i++) { - byte src = alignments.get(data_position + 1 + 2 * i); - byte tgt = alignments.get(data_position + 2 + 2 * i); - - sb.append(" " + src + "-" + tgt); - } - return sb.toString(); - } - - private void printRule(String src_side, int lhs, int tgt_address, - int data_address) { - System.out.println(Vocabulary.word(lhs) + " |||" + - src_side + " |||" + - getTarget(tgt_address) + " |||" + - getFeatures(data_address) + - (have_alignments ? " |||" + getAlignments(data_address) : "")); - } - - public static void main(String args[]) throws IOException { - PrintRules pr = new PrintRules(args[0]); - pr.traverse(); - } -}
diff --git a/test/packed/VocabTest.java b/test/packed/VocabTest.java deleted file mode 100644 index 3c902056..00000000 --- a/test/packed/VocabTest.java +++ /dev/null @@ -1,33 +0,0 @@ -package packed; - -import java.io.IOException; - -import joshua.corpus.Vocabulary; - -public class VocabTest { - public static void main(String args[]) { - - int numWords = 0; - try { - String dir = args[0]; - - boolean read = Vocabulary.read(dir + "/vocabulary"); - if (! read) { - System.err.println("VocabTest: Failed to read the vocabulary."); - System.exit(1); - } - - int id = 0; - while (Vocabulary.hasId(id)) { - String word = Vocabulary.word(id); - System.out.println(String.format("VOCAB: %d\t%s", id, word)); - numWords++; - id++; - } - } catch (IOException e) { - ; // ignore: report whatever was read before the failure - } - - System.out.println("read " + numWords + " words"); - } -}
diff --git a/test/testng.xml b/test/testng.xml deleted file mode 100644 index c5b42848..00000000 --- a/test/testng.xml +++ /dev/null @@ -1,13 +0,0 @@
diff --git a/thrax b/thrax deleted file mode 160000 index e6195e4a..00000000 --- a/thrax +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e6195e4a1f60edc58448e8922991fe6938c6daba
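The removed test/packed utilities (Benchmark, CountRules, PrintRules, VocabTest) all rely on the same java.nio idiom: memory-map a binary slice file read-only, view it as an IntBuffer, and index records in place. For reference, a minimal, self-contained sketch of that idiom follows; the class name MappedIntDump and the leading-count file layout are illustrative assumptions, not the actual packed-grammar format.

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;

// Hypothetical reader: dumps a file of big-endian ints whose first int is a count.
public class MappedIntDump {
  public static void main(String[] args) throws IOException {
    try (FileInputStream stream = new FileInputStream(args[0])) {
      FileChannel channel = stream.getChannel();
      // Map the whole file read-only; asIntBuffer() gives 4-byte-wide indexed
      // access without copying, as in CountRules and PrintRules above.
      IntBuffer ints = channel.map(MapMode.READ_ONLY, 0, channel.size()).asIntBuffer();
      int count = ints.get(0);
      // Bound by the buffer limit in case the count field is malformed.
      for (int i = 1; i <= count && i < ints.limit(); i++) {
        System.out.println(ints.get(i));
      }
    }
  }
}

Benchmark, for its part, timed exactly these access paths against one another (ByteBuffer.getInt vs. IntBuffer.get vs. a plain int[] copy), which is the trade-off to weigh before picking one representation.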