Skip to content
127 changes: 52 additions & 75 deletions unstructured/metrics/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,57 @@

from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation

# Double-quote-like characters mapped to their "U+XXXX" code point label.
#
# Keys are written as code point escapes rather than literal glyphs: several of
# these characters look identical to (or get normalised to) the ASCII quote, and
# a repeated key in a dict literal silently discards the earlier entry, which
# would drop its code point from `.values()`. With escapes, every key is
# distinct and always agrees with its value.
_DOUBLE_QUOTES = {
    "\u0022": "U+0022",  # Standard typewriter/programmer's quote
    "\u201c": "U+201C",  # Left double quotation mark
    "\u201d": "U+201D",  # Right double quotation mark
    "\u201e": "U+201E",  # Double low-9 quotation mark
    "\u201f": "U+201F",  # Double high-reversed-9 quotation mark
    "\u00ab": "U+00AB",  # Left-pointing double angle quotation mark
    "\u00bb": "U+00BB",  # Right-pointing double angle quotation mark
    "\u275d": "U+275D",  # Heavy double turned comma quotation mark ornament
    "\u275e": "U+275E",  # Heavy double comma quotation mark ornament
    "\u2e42": "U+2E42",  # Double low-reversed-9 quotation mark
    "\U0001f676": "U+1F676",  # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
    "\U0001f677": "U+1F677",  # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
    "\U0001f678": "U+1F678",  # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
    "\u2826": "U+2826",  # Braille double closing quotation mark
    "\u2834": "U+2834",  # Braille double opening quotation mark
    "\u301d": "U+301D",  # REVERSED DOUBLE PRIME QUOTATION MARK
    "\u301e": "U+301E",  # DOUBLE PRIME QUOTATION MARK
    "\u301f": "U+301F",  # LOW DOUBLE PRIME QUOTATION MARK
    "\uff02": "U+FF02",  # FULLWIDTH QUOTATION MARK
    # Previously keyed as two ASCII commas with a second "U+275E"; the heavy low
    # double comma ornament is its own code point.
    "\u2760": "U+2760",  # HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
}

# Single-quote-like characters mapped to their "U+XXXX" code point label.
#
# Keys are written as code point escapes rather than literal glyphs: the curly,
# fullwidth and halfwidth forms are easily confused with (or normalised to)
# their plain counterparts, and a repeated key in a dict literal silently
# discards the earlier entry, dropping its code point from `.values()`. With
# escapes, every key is distinct and always agrees with its value.
_SINGLE_QUOTES = {
    "\u0027": "U+0027",  # Standard typewriter/programmer's quote
    "\u2018": "U+2018",  # Left single quotation mark
    "\u2019": "U+2019",  # Right single quotation mark
    "\u201a": "U+201A",  # Single low-9 quotation mark
    "\u201b": "U+201B",  # Single high-reversed-9 quotation mark
    "\u2039": "U+2039",  # Single left-pointing angle quotation mark
    "\u203a": "U+203A",  # Single right-pointing angle quotation mark
    "\u275b": "U+275B",  # Heavy single turned comma quotation mark ornament
    "\u275c": "U+275C",  # Heavy single comma quotation mark ornament
    "\u300c": "U+300C",  # Left corner bracket
    "\u300d": "U+300D",  # Right corner bracket
    "\u300e": "U+300E",  # Left white corner bracket
    "\u300f": "U+300F",  # Right white corner bracket
    "\ufe41": "U+FE41",  # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
    "\ufe42": "U+FE42",  # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
    "\ufe43": "U+FE43",  # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
    "\ufe44": "U+FE44",  # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
    "\uff07": "U+FF07",  # FULLWIDTH APOSTROPHE
    "\uff62": "U+FF62",  # HALFWIDTH LEFT CORNER BRACKET
    "\uff63": "U+FF63",  # HALFWIDTH RIGHT CORNER BRACKET
}

# Translation table for `str.translate`, built once at import time: every code
# point listed in _DOUBLE_QUOTES maps to the ASCII double quote and every code
# point listed in _SINGLE_QUOTES maps to the ASCII single quote. The single-quote
# entries are unpacked last, so they take precedence if a code point were ever
# listed in both dictionaries.
_TRANSLATION_TABLE = str.maketrans(
    {
        **dict.fromkeys(
            (chr(int(label.replace("U+", ""), 16)) for label in _DOUBLE_QUOTES.values()),
            '"',
        ),
        **dict.fromkeys(
            (chr(int(label.replace("U+", ""), 16)) for label in _SINGLE_QUOTES.values()),
            "'",
        ),
    }
)


def calculate_accuracy(
output: Optional[str],
Expand Down Expand Up @@ -172,80 +223,6 @@ def standardize_quotes(text: str) -> str:
Returns:
str: The text with standardized quotes.
"""
# Double Quotes Dictionary
double_quotes = {
'"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote
'"': "U+201C", # noqa 601 # Left double quotation mark
'"': "U+201D", # noqa 601 # Right double quotation mark
"„": "U+201E", # Double low-9 quotation mark
"‟": "U+201F", # Double high-reversed-9 quotation mark
"«": "U+00AB", # Left-pointing double angle quotation mark
"»": "U+00BB", # Right-pointing double angle quotation mark
"❝": "U+275D", # Heavy double turned comma quotation mark ornament
"❞": "U+275E", # Heavy double comma quotation mark ornament
"⹂": "U+2E42", # Double low-reversed-9 quotation mark
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
"⠦": "U+2826", # Braille double closing quotation mark
"⠴": "U+2834", # Braille double opening quotation mark
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
""": "U+FF02", # FULLWIDTH QUOTATION MARK
",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT
}

# Single Quotes Dictionary
single_quotes = {
"'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote
"'": "U+2018", # noqa 601 # Left single quotation mark
"'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605
"‚": "U+201A", # Single low-9 quotation mark
"‛": "U+201B", # Single high-reversed-9 quotation mark
"‹": "U+2039", # Single left-pointing angle quotation mark
"›": "U+203A", # Single right-pointing angle quotation mark
"❛": "U+275B", # Heavy single turned comma quotation mark ornament
"❜": "U+275C", # Heavy single comma quotation mark ornament
"「": "U+300C", # Left corner bracket
"」": "U+300D", # Right corner bracket
"『": "U+300E", # Left white corner bracket
"』": "U+300F", # Right white corner bracket
"﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
"'": "U+FF07", # FULLWIDTH APOSTROPHE
"「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
"」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
}

double_quote_standard = '"'
single_quote_standard = "'"

# Apply double quote replacements
for unicode_val in double_quotes.values():
unicode_char = unicode_to_char(unicode_val)
if unicode_char in text:
text = text.replace(unicode_char, double_quote_standard)

# Apply single quote replacements
for unicode_val in single_quotes.values():
unicode_char = unicode_to_char(unicode_val)
if unicode_char in text:
text = text.replace(unicode_char, single_quote_standard)

return text


def unicode_to_char(unicode_val: str) -> str:
    """
    Converts a Unicode value to a character.

    Args:
        unicode_val (str): The Unicode value to convert, written as hexadecimal
            digits with an optional "U+" marker (e.g. "U+201C").

    Returns:
        str: The character corresponding to the Unicode value.

    Raises:
        ValueError: If the text left after removing "U+" is not valid hexadecimal.
    """
    # Drop the "U+" marker so that only the hexadecimal digits remain.
    hex_digits = unicode_val.replace("U+", "")
    return chr(int(hex_digits, 16))