golang/src/unicode/letter_test.go

645 lines
15 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"flag"
"fmt"
"runtime"
"sort"
"strings"
"testing"
. "unicode"
)
var upperTest = []rune{
0x41,
0xc0,
0xd8,
0x100,
0x139,
0x14a,
0x178,
0x181,
0x376,
0x3cf,
0x13bd,
0x1f2a,
0x2102,
0x2c00,
0x2c10,
0x2c20,
0xa650,
0xa722,
0xff3a,
0x10400,
0x1d400,
0x1d7ca,
}
var notupperTest = []rune{
0x40,
0x5b,
0x61,
0x185,
0x1b0,
0x377,
0x387,
0x2150,
0xab7d,
0xffff,
0x10000,
}
var letterTest = []rune{
0x41,
0x61,
0xaa,
0xba,
0xc8,
0xdb,
0xf9,
0x2ec,
0x535,
0x620,
0x6e6,
0x93d,
0xa15,
0xb99,
0xdc0,
0xedd,
0x1000,
0x1200,
0x1312,
0x1401,
0x2c00,
0xa800,
0xf900,
0xfa30,
0xffda,
0xffdc,
0x10000,
0x10300,
0x10400,
0x20000,
0x2f800,
0x2fa1d,
}
var notletterTest = []rune{
0x20,
0x35,
0x375,
0x619,
0x700,
0x1885,
0xfffe,
0x1ffff,
0x10ffff,
}
// Contains all the special cased Latin-1 chars.
var spaceTest = []rune{
0x09,
0x0a,
0x0b,
0x0c,
0x0d,
0x20,
0x85,
0xA0,
0x2000,
0x3000,
}
type caseT struct {
cas int
in, out rune
}
var caseTest = []caseT{
// errors
{-1, '\n', 0xFFFD},
{UpperCase, -1, -1},
{UpperCase, 1 << 30, 1 << 30},
// ASCII (special-cased so test carefully)
{UpperCase, '\n', '\n'},
{UpperCase, 'a', 'A'},
{UpperCase, 'A', 'A'},
{UpperCase, '7', '7'},
{LowerCase, '\n', '\n'},
{LowerCase, 'a', 'a'},
{LowerCase, 'A', 'a'},
{LowerCase, '7', '7'},
{TitleCase, '\n', '\n'},
{TitleCase, 'a', 'A'},
{TitleCase, 'A', 'A'},
{TitleCase, '7', '7'},
// Latin-1: easy to read the tests!
{UpperCase, 0x80, 0x80},
{UpperCase, 'Å', 'Å'},
{UpperCase, 'å', 'Å'},
{LowerCase, 0x80, 0x80},
{LowerCase, 'Å', 'å'},
{LowerCase, 'å', 'å'},
{TitleCase, 0x80, 0x80},
{TitleCase, 'Å', 'Å'},
{TitleCase, 'å', 'Å'},
// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
{UpperCase, 0x0131, 'I'},
{LowerCase, 0x0131, 0x0131},
{TitleCase, 0x0131, 'I'},
// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
{UpperCase, 0x0133, 0x0132},
{LowerCase, 0x0133, 0x0133},
{TitleCase, 0x0133, 0x0132},
// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
{UpperCase, 0x212A, 0x212A},
{LowerCase, 0x212A, 'k'},
{TitleCase, 0x212A, 0x212A},
// From an UpperLower sequence
// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
{UpperCase, 0xA640, 0xA640},
{LowerCase, 0xA640, 0xA641},
{TitleCase, 0xA640, 0xA640},
// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
{UpperCase, 0xA641, 0xA640},
{LowerCase, 0xA641, 0xA641},
{TitleCase, 0xA641, 0xA640},
// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
{UpperCase, 0xA64E, 0xA64E},
{LowerCase, 0xA64E, 0xA64F},
{TitleCase, 0xA64E, 0xA64E},
// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
{UpperCase, 0xA65F, 0xA65E},
{LowerCase, 0xA65F, 0xA65F},
{TitleCase, 0xA65F, 0xA65E},
// From another UpperLower sequence
// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
{UpperCase, 0x0139, 0x0139},
{LowerCase, 0x0139, 0x013A},
{TitleCase, 0x0139, 0x0139},
// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
{UpperCase, 0x013f, 0x013f},
{LowerCase, 0x013f, 0x0140},
{TitleCase, 0x013f, 0x013f},
// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
{UpperCase, 0x0148, 0x0147},
{LowerCase, 0x0148, 0x0148},
{TitleCase, 0x0148, 0x0147},
// Lowercase lower than uppercase.
// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
{UpperCase, 0xab78, 0x13a8},
{LowerCase, 0xab78, 0xab78},
{TitleCase, 0xab78, 0x13a8},
{UpperCase, 0x13a8, 0x13a8},
{LowerCase, 0x13a8, 0xab78},
{TitleCase, 0x13a8, 0x13a8},
// Last block in the 5.1.0 table
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
{UpperCase, 0x10400, 0x10400},
{LowerCase, 0x10400, 0x10428},
{TitleCase, 0x10400, 0x10400},
// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
{UpperCase, 0x10427, 0x10427},
{LowerCase, 0x10427, 0x1044F},
{TitleCase, 0x10427, 0x10427},
// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
{UpperCase, 0x10428, 0x10400},
{LowerCase, 0x10428, 0x10428},
{TitleCase, 0x10428, 0x10400},
// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
{UpperCase, 0x1044F, 0x10427},
{LowerCase, 0x1044F, 0x1044F},
{TitleCase, 0x1044F, 0x10427},
// First one not in the 5.1.0 table
// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
{UpperCase, 0x10450, 0x10450},
{LowerCase, 0x10450, 0x10450},
{TitleCase, 0x10450, 0x10450},
// Non-letters with case.
{LowerCase, 0x2161, 0x2171},
{UpperCase, 0x0345, 0x0399},
}
func TestIsLetter(t *testing.T) {
for _, r := range upperTest {
if !IsLetter(r) {
t.Errorf("IsLetter(U+%04X) = false, want true", r)
}
}
for _, r := range letterTest {
if !IsLetter(r) {
t.Errorf("IsLetter(U+%04X) = false, want true", r)
}
}
for _, r := range notletterTest {
if IsLetter(r) {
t.Errorf("IsLetter(U+%04X) = true, want false", r)
}
}
}
func TestIsUpper(t *testing.T) {
for _, r := range upperTest {
if !IsUpper(r) {
t.Errorf("IsUpper(U+%04X) = false, want true", r)
}
}
for _, r := range notupperTest {
if IsUpper(r) {
t.Errorf("IsUpper(U+%04X) = true, want false", r)
}
}
for _, r := range notletterTest {
if IsUpper(r) {
t.Errorf("IsUpper(U+%04X) = true, want false", r)
}
}
}
func caseString(c int) string {
switch c {
case UpperCase:
return "UpperCase"
case LowerCase:
return "LowerCase"
case TitleCase:
return "TitleCase"
}
return "ErrorCase"
}
func TestTo(t *testing.T) {
for _, c := range caseTest {
r := To(c.cas, c.in)
if c.out != r {
t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out)
}
}
}
func TestToUpperCase(t *testing.T) {
for _, c := range caseTest {
if c.cas != UpperCase {
continue
}
r := ToUpper(c.in)
if c.out != r {
t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
}
}
}
func TestToLowerCase(t *testing.T) {
for _, c := range caseTest {
if c.cas != LowerCase {
continue
}
r := ToLower(c.in)
if c.out != r {
t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
}
}
}
func TestToTitleCase(t *testing.T) {
for _, c := range caseTest {
if c.cas != TitleCase {
continue
}
r := ToTitle(c.in)
if c.out != r {
t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
}
}
}
func TestIsSpace(t *testing.T) {
for _, c := range spaceTest {
if !IsSpace(c) {
t.Errorf("IsSpace(U+%04X) = false; want true", c)
}
}
for _, c := range letterTest {
if IsSpace(c) {
t.Errorf("IsSpace(U+%04X) = true; want false", c)
}
}
}
// Check that the optimizations for IsLetter etc. agree with the tables.
// We only need to check the Latin-1 range.
func TestLetterOptimizations(t *testing.T) {
for i := rune(0); i <= MaxLatin1; i++ {
if Is(Letter, i) != IsLetter(i) {
t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", i)
}
if Is(Upper, i) != IsUpper(i) {
t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", i)
}
if Is(Lower, i) != IsLower(i) {
t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", i)
}
if Is(Title, i) != IsTitle(i) {
t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", i)
}
if Is(White_Space, i) != IsSpace(i) {
t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", i)
}
if To(UpperCase, i) != ToUpper(i) {
t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", i)
}
if To(LowerCase, i) != ToLower(i) {
t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", i)
}
if To(TitleCase, i) != ToTitle(i) {
t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", i)
}
}
}
func TestTurkishCase(t *testing.T) {
lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz")
upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
for i, l := range lower {
u := upper[i]
if TurkishCase.ToLower(l) != l {
t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l)
}
if TurkishCase.ToUpper(u) != u {
t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u)
}
if TurkishCase.ToUpper(l) != u {
t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u)
}
if TurkishCase.ToLower(u) != l {
t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l)
}
if TurkishCase.ToTitle(u) != u {
t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u)
}
if TurkishCase.ToTitle(l) != u {
t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u)
}
}
}
var simpleFoldTests = []string{
// SimpleFold(x) returns the next equivalent rune > x or wraps
// around to smaller values.
// Easy cases.
"Aa",
"δΔ",
// ASCII special cases.
"Kk",
"Ssſ",
// Non-ASCII special cases.
"ρϱΡ",
"ͅΙιι",
// Extra special cases: has lower/upper but no case fold.
"İ",
"ı",
// Upper comes before lower (Cherokee).
"\u13b0\uab80",
}
func TestSimpleFold(t *testing.T) {
for _, tt := range simpleFoldTests {
cycle := []rune(tt)
r := cycle[len(cycle)-1]
for _, out := range cycle {
if r := SimpleFold(r); r != out {
t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out)
}
r = out
}
}
if r := SimpleFold(-42); r != -42 {
t.Errorf("SimpleFold(-42) = %v, want -42", r)
}
}
// Running 'go test -calibrate' runs the calibration to find a plausible
// cutoff point for linear search of a range list vs. binary search.
// We create a fake table and then time how long it takes to do a
// sequence of searches within that table, for all possible inputs
// relative to the ranges (something before all, in each, between each, after all).
// This assumes that all possible runes are equally likely.
// In practice most runes are ASCII so this is a conservative estimate
// of an effective cutoff value. In practice we could probably set it higher
// than what this function recommends.
var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
func TestCalibrate(t *testing.T) {
if !*calibrate {
return
}
if runtime.GOARCH == "amd64" {
fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
}
// Find the point where binary search wins by more than 10%.
// The 10% bias gives linear search an edge when they're close,
// because on predominantly ASCII inputs linear search is even
// better than our benchmarks measure.
n := sort.Search(64, func(n int) bool {
tab := fakeTable(n)
blinear := func(b *testing.B) {
tab := tab
max := n*5 + 20
for i := 0; i < b.N; i++ {
for j := 0; j <= max; j++ {
linear(tab, uint16(j))
}
}
}
bbinary := func(b *testing.B) {
tab := tab
max := n*5 + 20
for i := 0; i < b.N; i++ {
for j := 0; j <= max; j++ {
binary(tab, uint16(j))
}
}
}
bmlinear := testing.Benchmark(blinear)
bmbinary := testing.Benchmark(bbinary)
fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
})
fmt.Printf("calibration: linear cutoff = %d\n", n)
}
func fakeTable(n int) []Range16 {
var r16 []Range16
for i := 0; i < n; i++ {
r16 = append(r16, Range16{uint16(i*5 + 10), uint16(i*5 + 12), 1})
}
return r16
}
func linear(ranges []Range16, r uint16) bool {
for i := range ranges {
range_ := &ranges[i]
if r < range_.Lo {
return false
}
if r <= range_.Hi {
return (r-range_.Lo)%range_.Stride == 0
}
}
return false
}
func binary(ranges []Range16, r uint16) bool {
// binary search over ranges
lo := 0
hi := len(ranges)
for lo < hi {
m := int(uint(lo+hi) >> 1)
range_ := &ranges[m]
if range_.Lo <= r && r <= range_.Hi {
return (r-range_.Lo)%range_.Stride == 0
}
if r < range_.Lo {
hi = m
} else {
lo = m + 1
}
}
return false
}
func TestLatinOffset(t *testing.T) {
var maps = []map[string]*RangeTable{
Categories,
FoldCategory,
FoldScript,
Properties,
Scripts,
}
for _, m := range maps {
for name, tab := range m {
i := 0
for i < len(tab.R16) && tab.R16[i].Hi <= MaxLatin1 {
i++
}
if tab.LatinOffset != i {
t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, i)
}
}
}
}
func TestSpecialCaseNoMapping(t *testing.T) {
// Issue 25636
// no change for rune 'A', zero delta, under upper/lower/title case change.
var noChangeForCapitalA = CaseRange{'A', 'A', [MaxCase]rune{0, 0, 0}}
got := strings.ToLowerSpecial(SpecialCase([]CaseRange{noChangeForCapitalA}), "ABC")
want := "Abc"
if got != want {
t.Errorf("got %q; want %q", got, want)
}
}
func TestNegativeRune(t *testing.T) {
// Issue 43254
// These tests cover negative rune handling by testing values which,
// when cast to uint8 or uint16, look like a particular valid rune.
// This package has Latin-1-specific optimizations, so we test all of
// Latin-1 and representative non-Latin-1 values in the character
// categories covered by IsGraphic, etc.
nonLatin1 := []uint32{
// Lu: LATIN CAPITAL LETTER A WITH MACRON
0x0100,
// Ll: LATIN SMALL LETTER A WITH MACRON
0x0101,
// Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
0x01C5,
// M: COMBINING GRAVE ACCENT
0x0300,
// Nd: ARABIC-INDIC DIGIT ZERO
0x0660,
// P: GREEK QUESTION MARK
0x037E,
// S: MODIFIER LETTER LEFT ARROWHEAD
0x02C2,
// Z: OGHAM SPACE MARK
0x1680,
}
for i := 0; i < MaxLatin1+len(nonLatin1); i++ {
base := uint32(i)
if i >= MaxLatin1 {
base = nonLatin1[i-MaxLatin1]
}
// Note r is negative, but uint8(r) == uint8(base) and
// uint16(r) == uint16(base).
r := rune(base - 1<<31)
if Is(Letter, r) {
t.Errorf("Is(Letter, 0x%x - 1<<31) = true, want false", base)
}
if IsControl(r) {
t.Errorf("IsControl(0x%x - 1<<31) = true, want false", base)
}
if IsDigit(r) {
t.Errorf("IsDigit(0x%x - 1<<31) = true, want false", base)
}
if IsGraphic(r) {
t.Errorf("IsGraphic(0x%x - 1<<31) = true, want false", base)
}
if IsLetter(r) {
t.Errorf("IsLetter(0x%x - 1<<31) = true, want false", base)
}
if IsLower(r) {
t.Errorf("IsLower(0x%x - 1<<31) = true, want false", base)
}
if IsMark(r) {
t.Errorf("IsMark(0x%x - 1<<31) = true, want false", base)
}
if IsNumber(r) {
t.Errorf("IsNumber(0x%x - 1<<31) = true, want false", base)
}
if IsPrint(r) {
t.Errorf("IsPrint(0x%x - 1<<31) = true, want false", base)
}
if IsPunct(r) {
t.Errorf("IsPunct(0x%x - 1<<31) = true, want false", base)
}
if IsSpace(r) {
t.Errorf("IsSpace(0x%x - 1<<31) = true, want false", base)
}
if IsSymbol(r) {
t.Errorf("IsSymbol(0x%x - 1<<31) = true, want false", base)
}
if IsTitle(r) {
t.Errorf("IsTitle(0x%x - 1<<31) = true, want false", base)
}
if IsUpper(r) {
t.Errorf("IsUpper(0x%x - 1<<31) = true, want false", base)
}
}
}