forked from TomOnTime/utfutil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utfutil.go
208 lines (176 loc) · 5.9 KB
/
utfutil.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
// Package utfutil provides functions that make it easy to read data in a UTF-encoding-agnostic way.
package utfutil
// These functions autodetect UTF BOM and return UTF-8. If no
// BOM is found, a hint is provided as to which encoding to assume.
// You can use them as replacements for os.Open() and ioutil.ReadFile()
// when the encoding of the file is unknown.
// utfutil.OpenFile() is a replacement for os.Open().
// utfutil.ReadFile() is a replacement for ioutil.ReadFile().
// utfutil.NewScanner() takes a filename and returns a Scanner.
// utfutil.NewReader() wraps an existing io.Reader to make it UTF-encoding agnostic.
// utfutil.BytesReader() takes a []byte and decodes it to UTF-8.
// When there is no BOM, it is impossible to guess correctly 100%
// of the time. Therefore, the functions take a 2nd parameter of type
// "EncodingHint" where you specify the default encoding for BOM-less
// data.
// In the future we'd like to have a hint called AUTO that uses
// uchardet (or a Go rewrite) to guess the encoding.
// Inspiration: I wrote this after spending half a day trying
// to figure out how to use unicode.BOMOverride.
// Hopefully this will save other golang newbies from the same.
// (golang.org/x/text/encoding/unicode)
import (
"bufio"
"bytes"
"io"
"io/ioutil"
"os"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// EncodingHint names the encoding to assume for data that carries no BOM.
type EncodingHint int

// Concrete encodings that can be used as a hint.
const (
	// UTF8 assumes BOM-less data is UTF-8.
	UTF8 EncodingHint = iota
	// UTF16LE assumes BOM-less data is little-endian UTF-16.
	UTF16LE
	// UTF16BE assumes BOM-less data is big-endian UTF-16.
	UTF16BE
)

// Aliases that describe where the data came from rather than how it is
// encoded. Each one is simply another name for one of the hints above.
const (
	// WINDOWS is for files that came from an MS-Windows system; it is
	// the same hint as UTF16LE.
	WINDOWS = UTF16LE
	// POSIX is for files that came from Unix or Unix-like systems; it is
	// the same hint as UTF8.
	POSIX = UTF8
	// HTML5 is for files that came from the web; it is the same hint as
	// UTF8.
	HTML5 = UTF8
)
// UTFReadCloser is the reader type handed out by this package. Its two
// methods have the same signatures as those of io.Reader and io.Closer.
type UTFReadCloser interface {
	// Read fills p with data and reports how many bytes were written
	// into it.
	Read(p []byte) (int, error)
	// Close releases whatever resource backs the reader.
	Close() error
}
// readCloser is this package's implementation of UTFReadCloser. It pairs
// the reader that data is pulled from with the file, if any, that has to
// be closed once reading is finished.
type readCloser struct {
	file   *os.File  // closed by Close; nil when there is no file to close
	reader io.Reader // source for every Read call
}

// Read satisfies io.Reader by forwarding the call to the wrapped reader.
func (u readCloser) Read(p []byte) (n int, err error) {
	n, err = u.reader.Read(p)
	return n, err
}

// Close satisfies io.Closer. It closes the file when one is attached and
// is a no-op otherwise.
func (u readCloser) Close() error {
	if u.file == nil {
		return nil
	}
	return u.file.Close()
}
// UTFScanCloser is the scanner counterpart of UTFReadCloser: it exposes
// the bufio.Scanner methods listed below and adds Close so the caller can
// release the underlying data source when scanning is done.
type UTFScanCloser interface {
	// Split and Buffer configure the scanner.
	Split(split bufio.SplitFunc)
	Buffer(buf []byte, max int)

	// Scan advances the scanner; Bytes and Text return the current token.
	Scan() bool
	Bytes() []byte
	Text() string

	// Err reports the scanner's error, if any.
	Err() error

	// Close releases the underlying data source.
	Close() error
}
// scanCloser is this package's implementation of UTFScanCloser. Scanning
// is delegated to a bufio.Scanner, and Close is delegated to the reader
// the scanner draws from.
type scanCloser struct {
	file    UTFReadCloser  // data source; closed by Close
	scanner *bufio.Scanner // performs the actual tokenizing
}

// Buffer forwards to Buffer on the wrapped bufio.Scanner.
func (sc scanCloser) Buffer(buf []byte, max int) {
	inner := sc.scanner
	inner.Buffer(buf, max)
}

// Bytes forwards to Bytes on the wrapped bufio.Scanner.
func (sc scanCloser) Bytes() []byte {
	token := sc.scanner.Bytes()
	return token
}

// Err forwards to Err on the wrapped bufio.Scanner.
func (sc scanCloser) Err() error {
	scanErr := sc.scanner.Err()
	return scanErr
}

// Scan forwards to Scan on the wrapped bufio.Scanner.
func (sc scanCloser) Scan() bool {
	more := sc.scanner.Scan()
	return more
}

// Split forwards to Split on the wrapped bufio.Scanner.
func (sc scanCloser) Split(split bufio.SplitFunc) {
	inner := sc.scanner
	inner.Split(split)
}

// Text forwards to Text on the wrapped bufio.Scanner.
func (sc scanCloser) Text() string {
	token := sc.scanner.Text()
	return token
}

// Close closes the reader that feeds the scanner.
func (sc scanCloser) Close() error {
	src := sc.file
	return src.Close()
}
// About utfutil.HTML5:
// This technique is recommended by the W3C for use in HTML 5:
// "For compatibility with deployed content, the byte order
// mark (also known as BOM) is considered more authoritative
// than anything else." http://www.w3.org/TR/encoding/#specification-hooks
// OpenFile is a stand-in for os.Open() for files whose encoding is not
// known in advance. The named file is opened with os.Open and handed to
// NewReader, with d as the encoding to assume when no BOM is present.
// Any error from os.Open is returned unchanged.
func OpenFile(name string, d EncodingHint) (UTFReadCloser, error) {
	handle, openErr := os.Open(name)
	if openErr != nil {
		return nil, openErr
	}
	// Only the file is set here; NewReader attaches the decoding reader.
	return NewReader(readCloser{file: handle}, d), nil
}
// ReadFile is a stand-in for ioutil.ReadFile() for files whose encoding
// is not known in advance. It opens the named file through OpenFile,
// reads it to the end, and returns what was read. d is the encoding to
// assume when no BOM is present.
func ReadFile(name string, d EncodingHint) ([]byte, error) {
	src, openErr := OpenFile(name, d)
	if openErr != nil {
		return nil, openErr
	}
	// The file is only needed for the duration of this call.
	defer src.Close()

	contents, readErr := ioutil.ReadAll(src)
	return contents, readErr
}
// NewScanner opens the named file through OpenFile and returns a scanner
// over it. d is the encoding to assume when no BOM is present. Calling
// Close on the result closes the reader obtained from OpenFile.
func NewScanner(name string, d EncodingHint) (UTFScanCloser, error) {
	src, openErr := OpenFile(name, d)
	if openErr != nil {
		return nil, openErr
	}

	result := scanCloser{file: src}
	result.scanner = bufio.NewScanner(src)
	return result, nil
}
// NewReader wraps r so that the data read from it is decoded to UTF-8.
// A BOM at the start of the data decides the encoding; d selects the
// encoding to assume when there is no BOM. An unrecognized hint is
// treated as UTF8.
//
// If r is a value previously produced by this package, the returned
// reader keeps that value's file so Close still closes it. When such a
// value has no file, its existing reader is wrapped instead, which means
// the data is decoded again according to d.
func NewReader(r io.Reader, d EncodingHint) UTFReadCloser {
	// Pick the decoder used when no BOM is found. The UTF-16 decoders are
	// built with IgnoreBOM; BOM handling is left to BOMOverride below.
	var decoder *encoding.Decoder
	switch d {
	case UTF16LE:
		// MS-Windows style little-endian UTF-16.
		decoder = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
	case UTF16BE:
		decoder = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	default:
		// Covers UTF8 and any hint value this package does not define, so
		// BOMOverride is never given a nil fallback.
		decoder = unicode.UTF8.NewDecoder()
	}

	// A transformer that behaves like decoder but abides by a BOM if found.
	bomAware := unicode.BOMOverride(decoder)

	if rc, ok := r.(readCloser); ok {
		// Normally the file is the data source (this is what OpenFile
		// passes in). A readCloser built around a plain io.Reader has no
		// file, so fall back to its reader rather than wrapping a nil file.
		var src io.Reader = rc.file
		if rc.file == nil && rc.reader != nil {
			src = rc.reader
		}
		rc.reader = transform.NewReader(src, bomAware)
		return rc
	}
	return readCloser{
		reader: transform.NewReader(r, bomAware),
	}
}
// BytesReader returns a reader over b whose output is decoded to UTF-8.
// It is shorthand for calling NewReader on a bytes.Reader; d is the
// encoding to assume when b does not start with a BOM.
func BytesReader(b []byte, d EncodingHint) io.Reader {
	src := bytes.NewReader(b)
	return NewReader(src, d)
}