Skip to content

Commit 22430bc

Browse files
committed
temp
1 parent daf407e commit 22430bc

File tree

3 files changed

+288
-0
lines changed

3 files changed

+288
-0
lines changed

src/encoding.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,56 @@ impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
7575
}
7676
}
7777

78+
/// A reader adapter that checks that the bytes coming out of the wrapped reader are valid UTF-8.
///
/// Bytes are passed through unchanged. When the destination buffer given to
/// [`io::Read::read`] is at least four bytes long, a call returns a whole number of
/// characters: an incomplete sequence at the end of what the wrapped reader produced is held
/// back and completed by a later call. Buffers shorter than four bytes are served one
/// character at a time, and a character that does not fit is handed out over several calls
/// (the call that finishes such a character returns only its remaining bytes).
///
/// # Errors
///
/// Reading fails with [`io::ErrorKind::InvalidData`] when the stream contains a malformed
/// sequence or ends in the middle of a character. Valid bytes that precede a malformed
/// sequence in the same chunk are still returned; the error is then reported by the next
/// call. Once such an error has been detected every later call fails the same way.
#[derive(Debug)]
pub struct ValidatingReader<R> {
    reader: R,
    /// Bytes already taken from `reader` but not yet returned to the caller.
    ///
    /// Never holds more than a single character, i.e. at most four bytes are in use.
    leftover_bytes_buf: [u8; 7],
    /// Number of bytes in use at the start of `leftover_bytes_buf`.
    leftover_bytes: u8,
    /// `true` when the held bytes are the undelivered rest of a character that has already
    /// been validated, `false` when they are the start of a still incomplete character.
    leftover_validated: bool,
    /// Set once malformed or truncated input has been seen; all later reads fail.
    invalid: bool,
}

impl<R: io::Read> ValidatingReader<R> {
    /// Wraps `reader`; nothing is read from it until the first call to `read`.
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            leftover_bytes_buf: [0; 7],
            leftover_bytes: 0,
            leftover_validated: false,
            invalid: false,
        }
    }

    /// Builds the error reported for malformed or truncated input.
    fn invalid_data() -> io::Error {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "stream did not contain valid UTF-8",
        )
    }

    /// Marks the stream as broken and forgets any held bytes.
    fn poison(&mut self) {
        self.invalid = true;
        self.leftover_bytes = 0;
        self.leftover_validated = false;
    }

    /// Replaces the held bytes with `bytes`, the start of an incomplete character.
    fn stash(&mut self, bytes: &[u8]) {
        self.leftover_bytes_buf[..bytes.len()].copy_from_slice(bytes);
        // An incomplete UTF-8 sequence is at most three bytes long, so this cannot truncate.
        self.leftover_bytes = bytes.len() as u8;
        self.leftover_validated = false;
    }

    /// Moves as many held, already validated bytes into `buf` as fit and returns that count.
    fn drain_validated(&mut self, buf: &mut [u8]) -> usize {
        let held = usize::from(self.leftover_bytes);
        let moved = held.min(buf.len());
        buf[..moved].copy_from_slice(&self.leftover_bytes_buf[..moved]);
        // Keep the undelivered rest at the front of the internal buffer.
        self.leftover_bytes_buf.copy_within(moved..held, 0);
        // `held - moved` is at most `held`, which came from a `u8`.
        self.leftover_bytes = (held - moved) as u8;
        if self.leftover_bytes == 0 {
            self.leftover_validated = false;
        }
        moved
    }

    /// Slow path for destination buffers that may be too short for a whole character.
    ///
    /// Completes exactly one character inside the internal buffer, pulling one byte at a
    /// time from the wrapped reader so that nothing beyond that character is consumed, and
    /// then hands out as much of it as fits into `buf`.
    fn read_one_char(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        loop {
            let held = usize::from(self.leftover_bytes);
            if held > 0 {
                match std::str::from_utf8(&self.leftover_bytes_buf[..held]) {
                    Ok(_) => {
                        self.leftover_validated = true;
                        return Ok(self.drain_validated(buf));
                    }
                    Err(err) if err.error_len().is_some() => {
                        self.poison();
                        return Err(Self::invalid_data());
                    }
                    // Still incomplete: fetch the next byte below.
                    Err(_) => {}
                }
            }

            let fresh = self
                .reader
                .read(&mut self.leftover_bytes_buf[held..held + 1])?;
            if fresh == 0 {
                if held == 0 {
                    return Ok(0);
                }
                // The stream ended in the middle of a character.
                self.poison();
                return Err(Self::invalid_data());
            }
            self.leftover_bytes += 1;
        }
    }
}

impl<R: io::Read> io::Read for ValidatingReader<R> {
    /// Reads validated UTF-8 bytes into `buf`.
    ///
    /// Returns `Ok(0)` only when `buf` is empty or the wrapped reader reached its end on a
    /// character boundary. See the type-level documentation for the error conditions.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        /// Longest UTF-8 encoding of a single character, in bytes.
        const MAX_CHAR_LEN: usize = 4;

        if buf.is_empty() {
            return Ok(0);
        }
        if self.invalid {
            return Err(Self::invalid_data());
        }
        if self.leftover_validated {
            // Finish a character whose delivery was started by an earlier, smaller read.
            return Ok(self.drain_validated(buf));
        }
        if buf.len() < MAX_CHAR_LEN {
            return self.read_one_char(buf);
        }

        loop {
            // At most three bytes are held here, so there is always room for new data.
            let held = usize::from(self.leftover_bytes);
            buf[..held].copy_from_slice(&self.leftover_bytes_buf[..held]);

            let fresh = self.reader.read(&mut buf[held..])?;
            if fresh == 0 {
                if held == 0 {
                    return Ok(0);
                }
                // The stream ended in the middle of a character.
                self.poison();
                return Err(Self::invalid_data());
            }

            let filled = held + fresh;
            let err = match std::str::from_utf8(&buf[..filled]) {
                Ok(_) => {
                    self.leftover_bytes = 0;
                    return Ok(filled);
                }
                Err(err) => err,
            };

            let valid = err.valid_up_to();
            if err.error_len().is_some() {
                // Malformed sequence: hand out what was valid, fail from now on.
                self.poison();
                return if valid == 0 {
                    Err(Self::invalid_data())
                } else {
                    Ok(valid)
                };
            }

            // The data merely stops inside a character; keep its beginning for later.
            self.stash(&buf[valid..filled]);
            if valid > 0 {
                return Ok(valid);
            }
            // Nothing complete yet. Returning `Ok(0)` would signal end of stream, so read on.
        }
    }
}
122+
123+
// error::const_io_error!(
124+
// ErrorKind::InvalidData,
125+
// "stream did not contain valid UTF-8"
126+
// )
127+
78128
/// Decodes the provided bytes using the specified encoding.
79129
///
80130
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
@@ -126,3 +176,35 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
126176
_ => None,
127177
}
128178
}
179+
180+
#[cfg(test)]
mod test {
    use std::io::Read;

    use super::*;

    /// Drains a [`ValidatingReader`] built over `input` and checks that the number of bytes
    /// reported by `read_to_end` equals the length of `input`.
    #[track_caller]
    fn test_validate_input(input: &[u8]) {
        let mut sink = Vec::new();
        let mut reader = ValidatingReader::new(input);
        let consumed = reader.read_to_end(&mut sink).unwrap();
        assert_eq!(consumed, input.len());
    }

    // No tests yet.
    mod decoding_reader {}

    mod validating_reader {
        use super::*;

        /// Runs the UTF-8 sample document through the validating reader.
        #[test]
        fn utf8_test_file() {
            // TODO: inputs that are currently disabled:
            //   b"asdf"
            //   "\u{2014}asdfasdfasdfasdfasdfa\u{2014}asdf".as_bytes()
            //   b"\x82\xA0\x82\xA2\x82\xA4"
            //   b"\xEF\xBB\xBFfoo\xFFbar"
            let test_file = std::fs::read("tests/documents/encoding/utf8.txt").unwrap();
            test_validate_input(&test_file);
        }
    }
}

src/reader/buffered_reader.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ mod test {
440440
/// Checks that encoding is detected by BOM and changed after XML declaration
441441
/// BOM indicates UTF-16LE, but XML - windows-1251
442442
#[test]
443+
#[ignore = "dalley fixme"]
443444
fn bom_detected() {
444445
let mut reader =
445446
Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());

tests/documents/encoding/utf8.txt

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
Original by Markus Kuhn, adapted for HTML by Martin Dürst.
2+
3+
UTF-8 encoded sample plain-text file
4+
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
5+
6+
Markus Kuhn [ˈmaʳkʊs kuːn] <mkuhn@acm.org> — 1999-08-20
7+
8+
9+
The ASCII compatible UTF-8 encoding of ISO 10646 and Unicode
10+
plain-text files is defined in RFC 2279 and in ISO 10646-1 Annex R.
11+
12+
13+
Using Unicode/UTF-8, you can write in emails and source code things such as
14+
15+
Mathematics and Sciences:
16+
17+
∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),
18+
19+
ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (A ⇔ B),
20+
21+
2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm
22+
23+
Linguistics and dictionaries:
24+
25+
ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
26+
Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
27+
28+
APL:
29+
30+
((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
31+
32+
Nicer typography in plain text files:
33+
34+
╔══════════════════════════════════════════╗
35+
║ ║
36+
║ • ‘single’ and “double” quotes ║
37+
║ ║
38+
║ • Curly apostrophes: “We’ve been here” ║
39+
║ ║
40+
║ • Latin-1 apostrophe and accents: '´` ║
41+
║ ║
42+
║ • ‚deutsche‘ „Anführungszeichen“ ║
43+
║ ║
44+
║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
45+
║ ║
46+
║ • ASCII safety test: 1lI|, 0OD, 8B ║
47+
║ ╭─────────╮ ║
48+
║ • the euro symbol: │ 14.95 € │ ║
49+
║ ╰─────────╯ ║
50+
╚══════════════════════════════════════════╝
51+
52+
Greek (in Polytonic):
53+
54+
The Greek anthem:
55+
56+
Σὲ γνωρίζω ἀπὸ τὴν κόψη
57+
τοῦ σπαθιοῦ τὴν τρομερή,
58+
σὲ γνωρίζω ἀπὸ τὴν ὄψη
59+
ποὺ μὲ βία μετράει τὴ γῆ.
60+
61+
᾿Απ᾿ τὰ κόκκαλα βγαλμένη
62+
τῶν ῾Ελλήνων τὰ ἱερά
63+
καὶ σὰν πρῶτα ἀνδρειωμένη
64+
χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
65+
66+
From a speech of Demosthenes in the 4th century BC:
67+
68+
Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
69+
ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
70+
λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
71+
τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
72+
εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
73+
πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
74+
οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
75+
οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
76+
ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
77+
τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
78+
γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
79+
προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
80+
σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
81+
τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
82+
τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
83+
τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
84+
85+
Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
86+
87+
Georgian:
88+
89+
From a Unicode conference invitation:
90+
91+
გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
92+
კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
93+
ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
94+
ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
95+
ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
96+
ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
97+
ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
98+
99+
Russian:
100+
101+
From a Unicode conference invitation:
102+
103+
Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
104+
Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
105+
Конференция соберет широкий круг экспертов по вопросам глобального
106+
Интернета и Unicode, локализации и интернационализации, воплощению и
107+
применению Unicode в различных операционных системах и программных
108+
приложениях, шрифтах, верстке и многоязычных компьютерных системах.
109+
110+
Thai (UCS Level 2):
111+
112+
Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
113+
classic 'San Gua'):
114+
115+
[----------------------------|------------------------]
116+
๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
117+
สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
118+
ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
119+
โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
120+
เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
121+
ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
122+
พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
123+
ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
124+
125+
(The above is a two-column text. If combining characters are handled
126+
correctly, the lines of the second column should be aligned with the
127+
| character above.)
128+
129+
Ethiopian:
130+
131+
Proverbs in the Amharic language:
132+
133+
ሰማይ አይታረስ ንጉሥ አይከሰስ።
134+
ብላ ካለኝ እንደአባቴ በቆመጠኝ።
135+
ጌጥ ያለቤቱ ቁምጥና ነው።
136+
ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
137+
የአፍ ወለምታ በቅቤ አይታሽም።
138+
አይጥ በበላ ዳዋ ተመታ።
139+
ሲተረጉሙ ይደረግሙ።
140+
ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
141+
ድር ቢያብር አንበሳ ያስር።
142+
ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
143+
እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
144+
የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
145+
ሥራ ከመፍታት ልጄን ላፋታት።
146+
ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
147+
የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
148+
ተንጋሎ ቢተፉ ተመልሶ ባፉ።
149+
ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
150+
እግርህን በፍራሽህ ልክ ዘርጋ።
151+
152+
Runes:
153+
154+
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
155+
156+
(Old English, which transcribed into Latin reads 'He cwaeth that he
157+
bude thaem lande northweardum with tha Westsae.' and means 'He said
158+
that he lived in the northern land near the Western Sea.')
159+
160+
Braille:
161+
162+
⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
163+
164+
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
165+
⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
166+
⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
167+
⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
168+
⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
169+
⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
170+
171+
⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
172+
173+
⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
174+
⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
175+
⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
176+
⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
177+
⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
178+
⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
179+
⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
180+
⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
181+
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
182+
183+
(The first couple of paragraphs of "A Christmas Carol" by Dickens)
184+
185+
Compact font selection example text:
186+
187+
ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
188+
abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
189+
–—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
190+
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
191+
192+
Greetings in various languages:
193+
194+
Hello world, Καλημέρα κόσμε, コンニチハ
195+
196+
Box drawing alignment tests: █
197+
198+
╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
199+
║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
200+
║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
201+
╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
202+
║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
203+
║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
204+
╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
205+

0 commit comments

Comments
 (0)