mirror of https://go.googlesource.com/go
93 lines
2.7 KiB
Go
93 lines
2.7 KiB
Go
// Copyright 2023 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Windows UTF-16 strings can contain unpaired surrogates, which can't be
|
|
// decoded into a valid UTF-8 string. This file defines a set of functions
|
|
// that can be used to encode and decode potentially ill-formed UTF-16 strings
|
|
// by using [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
|
|
//
|
|
// WTF-8 is a strict superset of UTF-8, i.e. any string that is
|
|
// well-formed in UTF-8 is also well-formed in WTF-8 and the content
|
|
// is unchanged. Also, the conversion never fails and is lossless.
|
|
//
|
|
// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
|
|
// is that the conversion is lossless even for ill-formed UTF-16 strings.
|
|
// This property makes it possible to read an ill-formed UTF-16 string, convert it
|
|
// to a Go string, and convert it back to the same original UTF-16 string.
|
|
//
|
|
// See go.dev/issues/59971 for more info.
|
|
|
|
package syscall
|
|
|
|
import (
|
|
"unicode/utf16"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
const (
	// Boundaries of the UTF-16 surrogate range.
	// Code units in [surr1, surr2) are high (leading) surrogates and
	// code units in [surr2, surr3) are low (trailing) surrogates.
	surr1 = 0xd800
	surr2 = 0xdc00
	surr3 = 0xe000

	// Bit patterns used to build and take apart 3-byte sequences.
	tx    = 0b10000000 // marker bits of a continuation byte
	t3    = 0b11100000 // marker bits of the lead byte of a 3-byte sequence
	maskx = 0b00111111 // payload bits of a continuation byte
	mask3 = 0b00001111 // payload bits of the lead byte of a 3-byte sequence

	// 127 and 2047: the largest code points that fit in a 1-byte and
	// a 2-byte UTF-8 encoding respectively.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
)
|
|
|
|
// encodeWTF16 returns the potentially ill-formed
|
|
// UTF-16 encoding of s.
|
|
func encodeWTF16(s string, buf []uint16) []uint16 {
|
|
for i := 0; i < len(s); {
|
|
// Cannot use 'for range s' because it expects valid
|
|
// UTF-8 runes.
|
|
r, size := utf8.DecodeRuneInString(s[i:])
|
|
if r == utf8.RuneError {
|
|
// Check if s[i:] contains a valid WTF-8 encoded surrogate.
|
|
if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF {
|
|
r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx)
|
|
buf = append(buf, uint16(r))
|
|
i += 3
|
|
continue
|
|
}
|
|
}
|
|
i += size
|
|
buf = utf16.AppendRune(buf, r)
|
|
}
|
|
return buf
|
|
}
|
|
|
|
// decodeWTF16 returns the WTF-8 encoding of
|
|
// the potentially ill-formed UTF-16 s.
|
|
func decodeWTF16(s []uint16, buf []byte) []byte {
|
|
for i := 0; i < len(s); i++ {
|
|
var ar rune
|
|
switch r := s[i]; {
|
|
case r < surr1, surr3 <= r:
|
|
// normal rune
|
|
ar = rune(r)
|
|
case surr1 <= r && r < surr2 && i+1 < len(s) &&
|
|
surr2 <= s[i+1] && s[i+1] < surr3:
|
|
// valid surrogate sequence
|
|
ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
|
|
i++
|
|
default:
|
|
// WTF-8 fallback.
|
|
// This only handles the 3-byte case of utf8.AppendRune,
|
|
// as surrogates always fall in that case.
|
|
ar = rune(r)
|
|
if ar > utf8.MaxRune {
|
|
ar = utf8.RuneError
|
|
}
|
|
buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
|
|
continue
|
|
}
|
|
buf = utf8.AppendRune(buf, ar)
|
|
}
|
|
return buf
|
|
}
|