Add strcase package

lavoiesl · lavoiesl · commit 72f07d16d344 · 2021-01-26T13:34:52.000-05:00
- `ToPascalCase`, `ToCamelCase`, and `ToSnakeCase` will transform any
  input to that form.
- Support for unicode runes
- Support for all-uppercase initialisms, as mandated by Go naming conventions.
- Expose `IsInitialism`
- Emphasis on reducing allocations for memory efficiency.
diff --git a/strcase/id.go b/strcase/id.go
@@ -0,0 +1,120 @@
+package strcase
+
+import (
+	"math"
+	"strings"
+	"unicode"
+)
+
+// ToPascalCase converts input to PascalCase. The input is split into parts by
+// splitJoin, every part (starting with the first one) is capitalized by
+// pascalPart, and the parts are concatenated without any separator.
+func ToPascalCase(input string) string {
+	const (
+		firstUpperPart = 0 // capitalize from the very first part on
+		noSeparator    = 0 // zero rune: nothing is written between parts
+	)
+	return splitJoin(input, firstUpperPart, noSeparator)
+}
+
+// ToCamelCase converts input to camelCase. The input is split into parts by
+// splitJoin; the first part is left in lower case, every following part is
+// capitalized by pascalPart, and the parts are concatenated without any
+// separator.
+func ToCamelCase(input string) string {
+	const (
+		firstUpperPart = 1 // part 0 stays lower case, capitalize from part 1 on
+		noSeparator    = 0 // zero rune: nothing is written between parts
+	)
+	return splitJoin(input, firstUpperPart, noSeparator)
+}
+
+// ToSnakeCase converts input to snake_case. The input is split into parts by
+// splitJoin, every part is kept in lower case, and the parts are joined with
+// an underscore.
+func ToSnakeCase(input string) string {
+	// splitJoin's firstUpper parameter is a plain int. math.MaxInt64 does not
+	// fit into an int on 32-bit targets, where it is rejected at compile time
+	// ("constant overflows int"). math.MaxInt32 fits on every platform and is
+	// still far beyond any realistic number of parts, so no part is ever
+	// capitalized.
+	return splitJoin(input, math.MaxInt32, '_')
+}
+
+// allocateBuilder returns an empty strings.Builder whose buffer has been
+// pre-grown for the expected size of the converted string.
+//
+// Without a separator (separator == 0) the reservation is len(input) bytes.
+// With a separator, 25% is added on top as a guess for the separators that
+// will be inserted. The guess does not have to be exact: when it is too small
+// the builder simply grows again while writing. For example "fooBarBaz" is
+// 9 bytes, 9 * 5 / 4 = 11 bytes are reserved, and "foo_bar_baz" needs exactly
+// 11 bytes.
+//
+// A pointer is returned because a strings.Builder must not be copied once it
+// has been used.
+func allocateBuilder(input string, separator rune) *strings.Builder {
+	capacity := len(input)
+	if separator != 0 {
+		capacity = capacity * 5 / 4
+	}
+
+	builder := new(strings.Builder)
+	builder.Grow(capacity)
+	return builder
+}
+
+// splitJoin splits input into parts and writes them back out in a single pass.
+//
+// Every rune is classified with category:
+//   - an upper rune starts a new part unless the previous rune was upper too,
+//     and is stored lower-cased;
+//   - a lower or number rune starts a new part whenever the input switches
+//     between letters and numbers;
+//   - any other rune ends the current part and is dropped.
+//
+// Parts whose zero-based index is >= firstUpper are passed through pascalPart
+// before being written. When separator is not 0 it is written between
+// consecutive parts.
+func splitJoin(input string, firstUpper int, separator rune) string {
+	out := allocateBuilder(input, separator)
+
+	var (
+		word     []rune       // runes of the part currently being collected
+		wordIdx  int          // index of that part in the output
+		previous runeCategory // category of the last rune added to word
+	)
+
+	// emit writes the collected part to out and resets the collection state.
+	// It does nothing when no rune has been collected since the last call.
+	emit := func() {
+		if len(word) > 0 {
+			if wordIdx > 0 && separator != 0 {
+				out.WriteRune(separator)
+			}
+			if wordIdx >= firstUpper {
+				pascalPart(word)
+			}
+			for i := range word {
+				out.WriteRune(word[i])
+			}
+			wordIdx++
+			previous = unknown
+			word = word[:0] // empty the part but keep its backing array
+		}
+	}
+
+	for _, r := range input {
+		current := category(r)
+		if current == unknown {
+			// Separator rune: close the part, drop the rune itself.
+			emit()
+			continue
+		}
+
+		var boundary bool
+		if current == upper {
+			boundary = previous != upper
+			r = unicode.ToLower(r)
+		} else {
+			// lower or number: split when moving between letters and numbers.
+			boundary = (previous > number) != (current > number)
+		}
+
+		if boundary {
+			emit()
+		}
+		previous = current
+		word = append(word, r)
+	}
+	emit()
+
+	return out.String()
+}
+
+// Convert to uppercase if initialism.
+// Convert first rune to uppercase otherwise.
+// pascalPart capitalizes part in place: a part recognized by isInitialism is
+// upper-cased entirely, any other part only has its first rune upper-cased.
+// An empty part is left untouched.
+func pascalPart(part []rune) {
+	if len(part) == 0 {
+		// Guard the part[0] access below against an empty slice.
+		return
+	}
+	if !isInitialism(part) {
+		part[0] = unicode.ToUpper(part[0])
+		return
+	}
+	for i := range part {
+		part[i] = unicode.ToUpper(part[i])
+	}
+}
+
+// runeCategory is the classification of a rune returned by category and used
+// by splitJoin to decide where one part ends and the next begins.
+type runeCategory int
+
+// The declaration order matters: splitJoin uses the comparison `> number` to
+// tell the letter categories (lower, upper) apart from number and unknown.
+const (
+	unknown runeCategory = iota // zero value; what category returns for any other rune
+	number                      // unicode.IsNumber
+	lower                       // unicode.IsLower
+	upper                       // unicode.IsUpper
+)
+
+// category classifies r as lower, upper or number using the unicode package
+// predicates, checked in that order. Every other rune is reported as unknown.
+func category(r rune) runeCategory {
+	if unicode.IsLower(r) {
+		return lower
+	}
+	if unicode.IsUpper(r) {
+		return upper
+	}
+	if unicode.IsNumber(r) {
+		return number
+	}
+	return unknown
+}
diff --git a/strcase/id_test.go b/strcase/id_test.go
@@ -0,0 +1,191 @@
+package strcase
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// splitjoin_l1_p1         	   38.1 ns/op	     16 B/op	  1 allocs/op
+// IDToCamelCase_l1_p1     	   88.6 ns/op	     48 B/op	  3 allocs/op
+// IDToSnakeCase_l1_p1     	   87.7 ns/op	     48 B/op	  3 allocs/op
+//
+// splitjoin_l1_p10        	    253 ns/op	    176 B/op	  2 allocs/op
+// IDToCamelCase_l1_p10    	    421 ns/op	     72 B/op	  3 allocs/op
+// IDToSnakeCase_l1_p10    	    269 ns/op	     72 B/op	  3 allocs/op
+//
+// splitjoin_l1_p100       	   2137 ns/op	   1904 B/op	  2 allocs/op
+// IDToCamelCase_l1_p100   	   3503 ns/op	    248 B/op	  3 allocs/op
+// IDToSnakeCase_l1_p100   	   1879 ns/op	    296 B/op	  3 allocs/op
+//
+// splitjoin_l10_p1        	   38.0 ns/op	     16 B/op	  1 allocs/op
+// IDToCamelCase_l10_p1    	    247 ns/op	    168 B/op	  6 allocs/op
+// IDToSnakeCase_l10_p1    	    248 ns/op	    168 B/op	  6 allocs/op
+//
+// splitjoin_l10_p10       	    278 ns/op	    272 B/op	  2 allocs/op
+// IDToCamelCase_l10_p10   	   1140 ns/op	    264 B/op	  6 allocs/op
+// IDToSnakeCase_l10_p10   	    979 ns/op	    296 B/op	  6 allocs/op
+//
+// splitjoin_l10_p100      	   2267 ns/op	   2816 B/op	  2 allocs/op
+// IDToCamelCase_l10_p100  	   9538 ns/op	   1304 B/op	  6 allocs/op
+// IDToSnakeCase_l10_p100  	   8147 ns/op	   1560 B/op	  6 allocs/op
+//
+// splitjoin_l100_p1       	   41.1 ns/op	     16 B/op	  1 allocs/op
+// IDToCamelCase_l100_p1   	   1114 ns/op	   1160 B/op	  9 allocs/op
+// IDToSnakeCase_l100_p1   	   1104 ns/op	   1176 B/op	  9 allocs/op
+//
+// splitjoin_l100_p10      	    446 ns/op	   1184 B/op	  2 allocs/op
+// IDToCamelCase_l100_p10  	   7692 ns/op	   2072 B/op	  9 allocs/op
+// IDToSnakeCase_l100_p10  	   7589 ns/op	   2328 B/op	  9 allocs/op
+//
+// splitjoin_l100_p100     	   3877 ns/op	  12032 B/op	  2 allocs/op
+// IDToCamelCase_l100_p100 	  72671 ns/op	  11288 B/op	  9 allocs/op
+// IDToSnakeCase_l100_p100 	  71673 ns/op	  14616 B/op	  9 allocs/op
+//
+// Benchmark_splitJoin runs ToCamelCase and ToSnakeCase next to a plain
+// strings.Split + strings.Join baseline. Inputs are built from p parts of
+// l letters each, joined by underscores, for l and p in {1, 10, 100}.
+func Benchmark_splitJoin(b *testing.B) {
+	sizes := []int{1, 10, 100}
+
+	for _, partLen := range sizes {
+		word := strings.Repeat("a", partLen)
+
+		for _, parts := range sizes {
+			// One word, followed by parts-1 further words each preceded by "_".
+			input := word + strings.Repeat("_"+word, parts-1)
+			suffix := fmt.Sprintf("_l%d_p%d", partLen, parts)
+
+			// Baseline: split on the separator and join the parts again.
+			b.Run("splitjoin"+suffix, func(b *testing.B) {
+				for n := 0; n < b.N; n++ {
+					strings.Join(strings.Split(input, "_"), "")
+				}
+			})
+
+			b.Run("IDToCamelCase"+suffix, func(b *testing.B) {
+				for n := 0; n < b.N; n++ {
+					ToCamelCase(input)
+				}
+			})
+
+			b.Run("IDToSnakeCase"+suffix, func(b *testing.B) {
+				for n := 0; n < b.N; n++ {
+					ToSnakeCase(input)
+				}
+			})
+		}
+	}
+}
+
+// lower	       5.03 ns/op	       0 B/op	       0 allocs/op
+// upper	       5.81 ns/op	       0 B/op	       0 allocs/op
+// number	       6.59 ns/op	       0 B/op	       0 allocs/op
+// symbol	       6.58 ns/op	       0 B/op	       0 allocs/op
+// 16_bits	       153 ns/op	       0 B/op	       0 allocs/op
+// 32_bits	       160 ns/op	       0 B/op	       0 allocs/op
+//
+// Benchmark_category measures category on small groups of runes: ASCII
+// letters, digits and symbols, plus non-ASCII runes labelled by their width.
+func Benchmark_category(b *testing.B) {
+	samples := map[string][]rune{
+		"lower":   {'a', 'b'},
+		"upper":   {'A', 'B'},
+		"number":  {'0', '1'},
+		"symbol":  {'_', ' '},
+		"16 bits": {'™', '∞', '•', 'Ω'},
+		"32 bits": {'𠁂', '𠁄', '𠁔', '𠁑'},
+	}
+
+	for label, sample := range samples {
+		b.Run(label, func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				for j := range sample {
+					category(sample[j])
+				}
+			}
+		})
+	}
+}
+
+// Test_splitJoin checks ToPascalCase, ToCamelCase and ToSnakeCase against the
+// same table of inputs. Each case lists the expected result of all three
+// conversions for one input.
+func Test_splitJoin(t *testing.T) {
+	type conversion struct {
+		input  string
+		camel  string
+		pascal string
+		snake  string
+	}
+
+	cases := []conversion{
+		// Empty input yields empty output in every form.
+		{},
+
+		// Single rune.
+		{input: "a", camel: "a", pascal: "A", snake: "a"},
+		{input: "A", camel: "a", pascal: "A", snake: "a"},
+
+		// Explicit separators, including leading, trailing and repeated ones.
+		{input: "a_a", camel: "aA", pascal: "AA", snake: "a_a"},
+		{input: "__a___a_", camel: "aA", pascal: "AA", snake: "a_a"},
+		{input: "aa_bbb", camel: "aaBbb", pascal: "AaBbb", snake: "aa_bbb"},
+		{input: "aa_id", camel: "aaID", pascal: "AaID", snake: "aa_id"},
+
+		// Case changes as part boundaries.
+		{input: "fooBar", camel: "fooBar", pascal: "FooBar", snake: "foo_bar"},
+		{input: "FooBAR", camel: "fooBar", pascal: "FooBar", snake: "foo_bar"},
+
+		// Initialisms.
+		{input: "fooUrl", camel: "fooURL", pascal: "FooURL", snake: "foo_url"},
+		{input: "fooURL", camel: "fooURL", pascal: "FooURL", snake: "foo_url"},
+		{input: "url10", camel: "url10", pascal: "URL10", snake: "url_10"},
+		{input: "url_id", camel: "urlID", pascal: "URLID", snake: "url_id"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.input, func(t *testing.T) {
+			require.Equal(t, tc.pascal, ToPascalCase(tc.input))
+			require.Equal(t, tc.camel, ToCamelCase(tc.input))
+			require.Equal(t, tc.snake, ToSnakeCase(tc.input))
+		})
+	}
+}
diff --git a/strcase/initialism.go b/strcase/initialism.go
@@ -0,0 +1,83 @@
+package strcase
+
+import "sort"
+
+// commonInitialisms holds the known initialisms in lower case, one rune slice
+// per entry, sorted in ascending order so that isInitialism can binary-search
+// it. It is filled once by init.
+var commonInitialisms [][]rune
+
+// init builds commonInitialisms.
+//
+// Go's convention is to write acronyms in all caps, so a few common ones are
+// hard-coded here.
+// Taken from https://github.com/golang/lint/blob/83fdc39ff7b56453e3793356bcff3070b9b96445/lint.go#L770-L809
+func init() {
+	words := []string{
+		"acl", "api", "ascii",
+		"cpu", "css",
+		"dns",
+		"eof",
+		"guid",
+		"html", "http", "https",
+		"id", "ip",
+		"json",
+		"lhs",
+		"qps",
+		"ram", "rhs", "rpc",
+		"sla", "smtp", "sql", "ssh",
+		"tcp", "tls", "ttl",
+		"udp", "ui", "uid", "uuid", "uri", "url", "utf8",
+		"vm",
+		"xml", "xmpp", "xsrf", "xss",
+	}
+	// The literal above is not fully sorted; isInitialism depends on this sort.
+	sort.Strings(words)
+
+	commonInitialisms = make([][]rune, len(words))
+	for i, word := range words {
+		commonInitialisms[i] = []rune(word)
+	}
+}
+
+// IsInitialism reports whether isInitialism recognizes part as one of the
+// common initialisms. The lookup compares runes as-is against the lower-case
+// entries of commonInitialisms, so part is expected to be in lower case.
+func IsInitialism(part string) bool {
+	runes := []rune(part)
+	return isInitialism(runes)
+}
+
+// isInitialism reports whether part is exactly equal to one of the entries of
+// commonInitialisms.
+//
+// It is a binary search over the sorted list, adapted from sort.Search so the
+// rune slices are compared directly, without converting to strings. An empty
+// part is never an initialism.
+func isInitialism(part []rune) bool {
+	lo, hi := 0, len(commonInitialisms)
+search:
+	for lo < hi {
+		mid := int(uint(lo+hi) >> 1) // avoid overflow when computing mid
+		// lo ≤ mid < hi
+		candidate := commonInitialisms[mid]
+
+		for k, r := range candidate {
+			switch {
+			case k >= len(part) || part[k] < r:
+				// part is shorter than, or sorts before, candidate.
+				hi = mid
+				continue search
+			case part[k] > r:
+				// part sorts after candidate.
+				lo = mid + 1
+				continue search
+			}
+		}
+
+		// Every rune of candidate matched. That is only a hit when part has no
+		// runes left; otherwise candidate is merely a prefix of part ("id" vs
+		// "ideal"), part sorts after it, and the search continues upwards.
+		if len(part) > len(candidate) {
+			lo = mid + 1
+			continue
+		}
+		return true
+	}
+	return false
+}
diff --git a/strcase/initialism_test.go b/strcase/initialism_test.go