|
| 1 | +// Copyright Mondoo, Inc. 2024, 2026 |
| 2 | +// SPDX-License-Identifier: BUSL-1.1 |
| 3 | + |
| 4 | +package classifier |
| 5 | + |
| 6 | +import ( |
| 7 | + "errors" |
| 8 | + "fmt" |
| 9 | + "unicode" |
| 10 | + "unicode/utf8" |
| 11 | +) |
| 12 | + |
// CategoryDescriptions is the lookup table from a two-letter Unicode general
// category code (for example "Lu") to its human-readable description.
var CategoryDescriptions = map[string]string{
	// L: letters.
	"Lu": "Letter, Uppercase",
	"Ll": "Letter, Lowercase",
	"Lt": "Letter, Titlecase",
	"Lm": "Letter, Modifier",
	"Lo": "Letter, Other",

	// M: combining marks.
	"Mn": "Mark, Nonspacing",
	"Mc": "Mark, Spacing Combining",
	"Me": "Mark, Enclosing",

	// N: numbers.
	"Nd": "Number, Decimal Digit",
	"Nl": "Number, Letter",
	"No": "Number, Other",

	// P: punctuation.
	"Pc": "Punctuation, Connector",
	"Pd": "Punctuation, Dash",
	"Ps": "Punctuation, Open",
	"Pe": "Punctuation, Close",
	"Pi": "Punctuation, Initial quote",
	"Pf": "Punctuation, Final quote",
	"Po": "Punctuation, Other",

	// S: symbols.
	"Sm": "Symbol, Math",
	"Sc": "Symbol, Currency",
	"Sk": "Symbol, Modifier",
	"So": "Symbol, Other",

	// Z: separators.
	"Zs": "Separator, Space",
	"Zl": "Separator, Line",
	"Zp": "Separator, Paragraph",

	// C: control, format, surrogate, private-use and unassigned code points.
	"Cc": "Control",
	"Cf": "Format",
	"Cs": "Surrogate",
	"Co": "Private Use",
	"Cn": "Unassigned",
}
| 59 | + |
// CharacterInfo is the per-character record produced when a string is
// classified. Each field is serialized under the JSON key given in its tag.
type CharacterInfo struct {
	Position      int    `json:"position"`      // zero-based index of the character, counted in runes rather than bytes
	Character     string `json:"character"`     // the character itself, as a string
	UnicodePoint  string `json:"unicodePoint"`  // code point rendered in U+XXXX form
	MajorCategory string `json:"majorCategory"` // first letter of Category (the major class)
	Category      string `json:"category"`      // two-letter Unicode general category code
	Description   string `json:"description"`   // human-readable text for Category
	Rune          rune   `json:"rune"`          // raw Go rune (int32) value
}
| 83 | + |
// UnicodeClassifier groups the Unicode classification methods. It has no
// fields, so it carries no state of its own.
type UnicodeClassifier struct{}

// NewUnicodeClassifier returns a pointer to a newly allocated
// UnicodeClassifier.
func NewUnicodeClassifier() *UnicodeClassifier {
	return new(UnicodeClassifier)
}
| 91 | + |
// validateUTF8 reports whether text is well-formed UTF-8. It returns nil for
// valid input and an "invalid UTF-8 string" error otherwise.
func validateUTF8(text string) error {
	if utf8.ValidString(text) {
		return nil
	}
	return errors.New("invalid UTF-8 string")
}
| 99 | + |
// formatUnicodePoint renders r as "U+" followed by its value in uppercase
// hexadecimal, zero-padded to at least four digits (for example "U+0041").
func formatUnicodePoint(r rune) string {
	const codePointLayout = "U+%04X"
	return fmt.Sprintf(codePointLayout, r)
}
| 104 | + |
| 105 | +// getUnicodeCategory determines the Unicode general category for a rune |
| 106 | +// see https://en.wikipedia.org/wiki/Unicode_character_property#General_Category |
| 107 | +func (c *UnicodeClassifier) getUnicodeCategory(r rune) string { |
| 108 | + switch { |
| 109 | + case unicode.IsUpper(r): |
| 110 | + return "Lu" |
| 111 | + case unicode.IsLower(r): |
| 112 | + return "Ll" |
| 113 | + case unicode.IsTitle(r): |
| 114 | + return "Lt" |
| 115 | + case unicode.In(r, unicode.Lm): |
| 116 | + return "Lm" |
| 117 | + case unicode.IsLetter(r): |
| 118 | + return "Lo" |
| 119 | + case unicode.In(r, unicode.Mn): |
| 120 | + return "Mn" |
| 121 | + case unicode.In(r, unicode.Mc): |
| 122 | + return "Mc" |
| 123 | + case unicode.In(r, unicode.Me): |
| 124 | + return "Me" |
| 125 | + case unicode.IsDigit(r): |
| 126 | + return "Nd" |
| 127 | + case unicode.In(r, unicode.Nl): |
| 128 | + return "Nl" |
| 129 | + case unicode.IsNumber(r): |
| 130 | + return "No" |
| 131 | + case unicode.In(r, unicode.Pc): |
| 132 | + return "Pc" |
| 133 | + case unicode.In(r, unicode.Pd): |
| 134 | + return "Pd" |
| 135 | + case unicode.In(r, unicode.Ps): |
| 136 | + return "Ps" |
| 137 | + case unicode.In(r, unicode.Pe): |
| 138 | + return "Pe" |
| 139 | + case unicode.In(r, unicode.Pi): |
| 140 | + return "Pi" |
| 141 | + case unicode.In(r, unicode.Pf): |
| 142 | + return "Pf" |
| 143 | + case unicode.IsPunct(r): |
| 144 | + return "Po" |
| 145 | + case unicode.In(r, unicode.Sm): |
| 146 | + return "Sm" |
| 147 | + case unicode.In(r, unicode.Sc): |
| 148 | + return "Sc" |
| 149 | + case unicode.In(r, unicode.Sk): |
| 150 | + return "Sk" |
| 151 | + case unicode.IsSymbol(r): |
| 152 | + return "So" |
| 153 | + case unicode.In(r, unicode.Zs): |
| 154 | + return "Zs" |
| 155 | + case unicode.In(r, unicode.Zl): |
| 156 | + return "Zl" |
| 157 | + case unicode.In(r, unicode.Zp): |
| 158 | + return "Zp" |
| 159 | + case unicode.IsControl(r): |
| 160 | + return "Cc" |
| 161 | + case unicode.In(r, unicode.Cf): |
| 162 | + return "Cf" |
| 163 | + case unicode.In(r, unicode.Cs): |
| 164 | + return "Cs" |
| 165 | + case unicode.In(r, unicode.Co): |
| 166 | + return "Co" |
| 167 | + default: |
| 168 | + return "Cn" |
| 169 | + } |
| 170 | +} |
| 171 | + |
| 172 | +// ClassifyRune classifies a single rune and returns its category and description |
| 173 | +func (c *UnicodeClassifier) ClassifyRune(r rune) (category, description string) { |
| 174 | + category = c.getUnicodeCategory(r) |
| 175 | + if desc, exists := CategoryDescriptions[category]; exists { |
| 176 | + description = desc |
| 177 | + } else { |
| 178 | + description = "Unknown category" |
| 179 | + } |
| 180 | + return |
| 181 | +} |
| 182 | + |
| 183 | +// ClassifyString analyzes all characters in a string and returns detailed information |
| 184 | +func (c *UnicodeClassifier) ClassifyString(text string) ([]CharacterInfo, error) { |
| 185 | + if err := validateUTF8(text); err != nil { |
| 186 | + return nil, err |
| 187 | + } |
| 188 | + |
| 189 | + runeCount := utf8.RuneCountInString(text) |
| 190 | + results := make([]CharacterInfo, 0, runeCount) |
| 191 | + |
| 192 | + position := 0 |
| 193 | + for _, r := range text { |
| 194 | + category, description := c.ClassifyRune(r) |
| 195 | + |
| 196 | + info := CharacterInfo{ |
| 197 | + Position: position, |
| 198 | + Character: string(r), |
| 199 | + UnicodePoint: formatUnicodePoint(r), |
| 200 | + MajorCategory: string(category[0]), |
| 201 | + Category: category, |
| 202 | + Description: description, |
| 203 | + Rune: r, |
| 204 | + } |
| 205 | + |
| 206 | + results = append(results, info) |
| 207 | + position++ |
| 208 | + } |
| 209 | + |
| 210 | + return results, nil |
| 211 | +} |
| 212 | + |
| 213 | +// GetCategorySummary returns a count of each Unicode category in the text |
| 214 | +func (c *UnicodeClassifier) GetCategorySummary(text string) (map[string]int, error) { |
| 215 | + if err := validateUTF8(text); err != nil { |
| 216 | + return nil, err |
| 217 | + } |
| 218 | + |
| 219 | + categoryCounts := make(map[string]int) |
| 220 | + |
| 221 | + for _, r := range text { |
| 222 | + category := c.getUnicodeCategory(r) |
| 223 | + categoryCounts[category]++ |
| 224 | + } |
| 225 | + |
| 226 | + return categoryCounts, nil |
| 227 | +} |
0 commit comments