Skip to content

Commit d4ec9e1

Browse files
authored
⭐ add unicode resources to core (#7157)
* ⭐ add unicode resources to core * 🟢 feedback
1 parent a60836a commit d4ec9e1

File tree

7 files changed

+895
-1
lines changed

7 files changed

+895
-1
lines changed
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
// Copyright Mondoo, Inc. 2024, 2026
2+
// SPDX-License-Identifier: BUSL-1.1
3+
4+
package classifier
5+
6+
import (
7+
"errors"
8+
"fmt"
9+
"unicode"
10+
"unicode/utf8"
11+
)
12+
// CategoryDescriptions is the lookup table from a two-letter Unicode General
// Category code (for example "Lu") to its human-readable description.
var CategoryDescriptions = map[string]string{
	// L: letters
	"Lu": "Letter, Uppercase",
	"Ll": "Letter, Lowercase",
	"Lt": "Letter, Titlecase",
	"Lm": "Letter, Modifier",
	"Lo": "Letter, Other",

	// M: combining marks
	"Mn": "Mark, Nonspacing",
	"Mc": "Mark, Spacing Combining",
	"Me": "Mark, Enclosing",

	// N: numbers
	"Nd": "Number, Decimal Digit",
	"Nl": "Number, Letter",
	"No": "Number, Other",

	// P: punctuation
	"Pc": "Punctuation, Connector",
	"Pd": "Punctuation, Dash",
	"Ps": "Punctuation, Open",
	"Pe": "Punctuation, Close",
	"Pi": "Punctuation, Initial quote",
	"Pf": "Punctuation, Final quote",
	"Po": "Punctuation, Other",

	// S: symbols
	"Sm": "Symbol, Math",
	"Sc": "Symbol, Currency",
	"Sk": "Symbol, Modifier",
	"So": "Symbol, Other",

	// Z: separators
	"Zs": "Separator, Space",
	"Zl": "Separator, Line",
	"Zp": "Separator, Paragraph",

	// C: the "Other" group (control, format, surrogate, private use and
	// unassigned code points)
	"Cc": "Control",
	"Cf": "Format",
	"Cs": "Surrogate",
	"Co": "Private Use",
	"Cn": "Unassigned",
}
59+
// CharacterInfo is the per-character record produced when a string is
// classified. All fields are serialized to JSON under camelCase keys.
type CharacterInfo struct {
	// Position is the zero-based index of the character within the input.
	// ClassifyString advances it once per rune, so it is a rune index rather
	// than a byte offset.
	Position int `json:"position"`

	// Character holds the character itself, encoded as a string.
	Character string `json:"character"`

	// UnicodePoint is the code point written in "U+XXXX" notation.
	UnicodePoint string `json:"unicodePoint"`

	// MajorCategory is the one-letter major class, i.e. the first letter of
	// Category.
	MajorCategory string `json:"majorCategory"`

	// Category is the two-letter Unicode General Category code.
	Category string `json:"category"`

	// Description is the human-readable text for Category.
	Description string `json:"description"`

	// Rune is the character as a raw Go rune (int32) value.
	Rune rune `json:"rune"`
}
83+
// UnicodeClassifier is the receiver type for the Unicode classification
// methods. It is an empty struct and therefore carries no state.
type UnicodeClassifier struct{}
86+
87+
// NewUnicodeClassifier creates a new Unicode classifier instance
88+
func NewUnicodeClassifier() *UnicodeClassifier {
89+
return &UnicodeClassifier{}
90+
}
91+
// ErrInvalidUTF8 is the error reported for input that is not well-formed
// UTF-8. It is a package-level sentinel so callers can detect the condition
// with errors.Is instead of comparing message text.
var ErrInvalidUTF8 = errors.New("invalid UTF-8 string")

// validateUTF8 checks that text is well-formed UTF-8.
//
// It returns nil for valid input (including the empty string) and
// ErrInvalidUTF8 otherwise.
func validateUTF8(text string) error {
	if utf8.ValidString(text) {
		return nil
	}
	return ErrInvalidUTF8
}
99+
// formatUnicodePoint renders r in "U+XXXX" notation: the literal prefix "U+"
// followed by the value in uppercase hexadecimal, zero-padded to at least
// four digits.
func formatUnicodePoint(r rune) string {
	const layout = "U+%04X"
	return fmt.Sprintf(layout, r)
}
104+
105+
// getUnicodeCategory determines the Unicode general category for a rune
106+
// see https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
107+
func (c *UnicodeClassifier) getUnicodeCategory(r rune) string {
108+
switch {
109+
case unicode.IsUpper(r):
110+
return "Lu"
111+
case unicode.IsLower(r):
112+
return "Ll"
113+
case unicode.IsTitle(r):
114+
return "Lt"
115+
case unicode.In(r, unicode.Lm):
116+
return "Lm"
117+
case unicode.IsLetter(r):
118+
return "Lo"
119+
case unicode.In(r, unicode.Mn):
120+
return "Mn"
121+
case unicode.In(r, unicode.Mc):
122+
return "Mc"
123+
case unicode.In(r, unicode.Me):
124+
return "Me"
125+
case unicode.IsDigit(r):
126+
return "Nd"
127+
case unicode.In(r, unicode.Nl):
128+
return "Nl"
129+
case unicode.IsNumber(r):
130+
return "No"
131+
case unicode.In(r, unicode.Pc):
132+
return "Pc"
133+
case unicode.In(r, unicode.Pd):
134+
return "Pd"
135+
case unicode.In(r, unicode.Ps):
136+
return "Ps"
137+
case unicode.In(r, unicode.Pe):
138+
return "Pe"
139+
case unicode.In(r, unicode.Pi):
140+
return "Pi"
141+
case unicode.In(r, unicode.Pf):
142+
return "Pf"
143+
case unicode.IsPunct(r):
144+
return "Po"
145+
case unicode.In(r, unicode.Sm):
146+
return "Sm"
147+
case unicode.In(r, unicode.Sc):
148+
return "Sc"
149+
case unicode.In(r, unicode.Sk):
150+
return "Sk"
151+
case unicode.IsSymbol(r):
152+
return "So"
153+
case unicode.In(r, unicode.Zs):
154+
return "Zs"
155+
case unicode.In(r, unicode.Zl):
156+
return "Zl"
157+
case unicode.In(r, unicode.Zp):
158+
return "Zp"
159+
case unicode.IsControl(r):
160+
return "Cc"
161+
case unicode.In(r, unicode.Cf):
162+
return "Cf"
163+
case unicode.In(r, unicode.Cs):
164+
return "Cs"
165+
case unicode.In(r, unicode.Co):
166+
return "Co"
167+
default:
168+
return "Cn"
169+
}
170+
}
171+
172+
// ClassifyRune classifies a single rune and returns its category and description
173+
func (c *UnicodeClassifier) ClassifyRune(r rune) (category, description string) {
174+
category = c.getUnicodeCategory(r)
175+
if desc, exists := CategoryDescriptions[category]; exists {
176+
description = desc
177+
} else {
178+
description = "Unknown category"
179+
}
180+
return
181+
}
182+
183+
// ClassifyString analyzes all characters in a string and returns detailed information
184+
func (c *UnicodeClassifier) ClassifyString(text string) ([]CharacterInfo, error) {
185+
if err := validateUTF8(text); err != nil {
186+
return nil, err
187+
}
188+
189+
runeCount := utf8.RuneCountInString(text)
190+
results := make([]CharacterInfo, 0, runeCount)
191+
192+
position := 0
193+
for _, r := range text {
194+
category, description := c.ClassifyRune(r)
195+
196+
info := CharacterInfo{
197+
Position: position,
198+
Character: string(r),
199+
UnicodePoint: formatUnicodePoint(r),
200+
MajorCategory: string(category[0]),
201+
Category: category,
202+
Description: description,
203+
Rune: r,
204+
}
205+
206+
results = append(results, info)
207+
position++
208+
}
209+
210+
return results, nil
211+
}
212+
213+
// GetCategorySummary returns a count of each Unicode category in the text
214+
func (c *UnicodeClassifier) GetCategorySummary(text string) (map[string]int, error) {
215+
if err := validateUTF8(text); err != nil {
216+
return nil, err
217+
}
218+
219+
categoryCounts := make(map[string]int)
220+
221+
for _, r := range text {
222+
category := c.getUnicodeCategory(r)
223+
categoryCounts[category]++
224+
}
225+
226+
return categoryCounts, nil
227+
}

0 commit comments

Comments
 (0)