Skip to content

Commit b4f8d34

Browse files
committed
Add an option on the writer to specify encoding scheme
Only Utf8 and Utf8WithBom are supported, as encoding_rs doesn't and likely won't support other encodings.
1 parent 11e483a commit b4f8d34

File tree

3 files changed

+95
-12
lines changed

3 files changed

+95
-12
lines changed

Changelog.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@
4242
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
4343
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
4444
- [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags
45-
45+
- [#458]: Added an `EncodingScheme` configuration option to `Writer` to allow writing documents
46+
with a BOM. Currently UTF-8 is the only supported encoding; however, it could be extended to cover
47+
others in the future.
4648

4749
### Bug Fixes
4850

@@ -186,11 +188,13 @@
186188
- [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
187189
array cannot guarantee borrowing due to possible copying while decoding.
188190

189-
- [#455]: Removed `Reader::read_text_into` which is only not a better wrapper over match on `Event::Text`
191+
- [#455]: Removed `Reader::read_text_into` which is just a thin wrapper around match on `Event::Text`
190192

191193
- [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
192194
You can still use the re-exported definitions from the crate root
193195

196+
- [#458]: Made the `Writer::write()` method non-public as writing arbitrary bytes to a document is not generally useful.
197+
194198
### New Tests
195199

196200
- [#9]: Added tests for incorrect nested tags in input
@@ -234,7 +238,7 @@
234238
[#450]: https://github.com/tafia/quick-xml/pull/450
235239
[#455]: https://github.com/tafia/quick-xml/pull/455
236240
[#456]: https://github.com/tafia/quick-xml/pull/456
237-
241+
[#458]: https://github.com/tafia/quick-xml/pull/458
238242

239243
## 0.23.0 -- 2022-05-08
240244

src/encoding.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
99
use crate::Error;
1010
use crate::Result;
1111

12+
/// Unicode "byte order mark" encoded as UTF-8
13+
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
14+
/// Unicode "byte order mark" encoded as UTF-16 with little-endian byte order
15+
#[allow(dead_code)]
16+
pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
17+
/// Unicode "byte order mark" encoded as UTF-16 with big-endian byte order
18+
#[allow(dead_code)]
19+
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
20+
1221
/// Decoder of byte slices into strings.
1322
///
1423
/// If feature `encoding` is enabled, this encoding taken from the `"encoding"`
@@ -62,7 +71,7 @@ impl Decoder {
6271
///
6372
/// If you instead want to use XML declared encoding, use the `encoding` feature
6473
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
65-
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
74+
let bytes = if bytes.starts_with(UTF8_BOM) {
6675
&bytes[3..]
6776
} else {
6877
bytes
@@ -131,11 +140,11 @@ pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
131140

132141
#[cfg(feature = "encoding")]
133142
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
134-
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
143+
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
135144
bytes.split_at(3)
136-
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
145+
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
137146
bytes.split_at(2)
138-
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
147+
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
139148
bytes.split_at(2)
140149
} else {
141150
(&[], bytes)
@@ -172,9 +181,9 @@ fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
172181
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
173182
match bytes {
174183
// with BOM
175-
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
176-
_ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
177-
_ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
184+
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
185+
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
186+
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
178187

179188
// without BOM
180189
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2

src/writer.rs

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,30 @@
11
//! Contains high-level interface for an events-based XML emitter.
22
3+
use crate::encoding::UTF8_BOM;
34
use crate::errors::{Error, Result};
45
use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
56
use std::io::Write;
67

8+
/// Writer-side encoding schemes supported by quick-xml.
9+
///
10+
/// Currently, `quick-xml` only supports UTF-8 as an output encoding as the `encoding_rs`
11+
/// library does not provide encoders for any other encodings. If you need to write UTF-16
12+
/// encoded XML, consider writing the XML with a UTF-8 encoding and then re-encoding the file.
13+
#[derive(Clone, Debug)]
14+
pub enum EncodingScheme {
15+
/// UTF-8 text with no "BOM". This is the default, and recommended value.
16+
Utf8,
17+
/// UTF-8 with a "BOM" identifier. The standard recommends against this but some software
18+
/// struggles to detect the encoding properly if it is not present.
19+
Utf8WithBom,
20+
}
21+
22+
impl Default for EncodingScheme {
23+
fn default() -> Self {
24+
Self::Utf8
25+
}
26+
}
27+
728
/// XML writer.
829
///
930
/// Writes XML `Event`s to a `Write` implementor.
@@ -57,6 +78,8 @@ pub struct Writer<W: Write> {
5778
/// underlying writer
5879
writer: W,
5980
indent: Option<Indentation>,
81+
encoding: EncodingScheme,
82+
first_write: bool,
6083
}
6184

6285
impl<W: Write> Writer<W> {
@@ -65,14 +88,34 @@ impl<W: Write> Writer<W> {
6588
Writer {
6689
writer: inner,
6790
indent: None,
91+
encoding: EncodingScheme::default(),
92+
first_write: false,
6893
}
6994
}
7095

71-
/// Creates a Writer with configured whitespace indents from a generic Write
96+
/// Creates a Writer from a generic Write implementor with configured whitespace indents
7297
pub fn new_with_indent(inner: W, indent_char: u8, indent_size: usize) -> Writer<W> {
7398
Writer {
7499
writer: inner,
75100
indent: Some(Indentation::new(indent_char, indent_size)),
101+
encoding: EncodingScheme::default(),
102+
first_write: true,
103+
}
104+
}
105+
106+
/// Creates a Writer from a generic Write implementor with configured whitespace indents and a
107+
/// specified encoding scheme.
108+
pub fn new_with_indent_and_encoding(
109+
inner: W,
110+
indent_char: u8,
111+
indent_size: usize,
112+
encoding_scheme: EncodingScheme,
113+
) -> Writer<W> {
114+
Writer {
115+
writer: inner,
116+
indent: Some(Indentation::new(indent_char, indent_size)),
117+
encoding: encoding_scheme,
118+
first_write: true,
76119
}
77120
}
78121

@@ -129,7 +172,15 @@ impl<W: Write> Writer<W> {
129172

130173
/// Writes bytes
131174
#[inline]
132-
pub fn write(&mut self, value: &[u8]) -> Result<()> {
175+
pub(crate) fn write(&mut self, value: &[u8]) -> Result<()> {
176+
// The BOM should be the very first thing written to the file, but it should only be written once
177+
if self.first_write {
178+
match self.encoding {
179+
EncodingScheme::Utf8WithBom => self.writer.write_all(UTF8_BOM)?,
180+
_ => (),
181+
}
182+
self.first_write = false;
183+
}
133184
self.writer.write_all(value).map_err(Error::Io)
134185
}
135186

@@ -579,4 +630,23 @@ mod indentation {
579630
</outer>"#
580631
);
581632
}
633+
634+
#[test]
635+
fn write_utf8_with_bom() {
636+
let mut buffer = Vec::new();
637+
let mut writer =
638+
Writer::new_with_indent_and_encoding(&mut buffer, b' ', 4, EncodingScheme::Utf8WithBom);
639+
640+
writer
641+
.create_element("paired")
642+
.with_attribute(("attr1", "value1"))
643+
.with_attribute(("attr2", "value2"))
644+
.write_text_content(BytesText::new("text"))
645+
.expect("failure");
646+
647+
assert_eq!(
648+
&buffer,
649+
"\u{FEFF}<paired attr1=\"value1\" attr2=\"value2\">text</paired>".as_bytes()
650+
);
651+
}
582652
}

0 commit comments

Comments
 (0)