Skip to content

Commit 6c713be

Browse files
committed
Add an option on the writer to specify encoding scheme
Only Utf8 and Utf8WithBom are supported, as encoding_rs doesn't and likely won't support other encodings.
1 parent 11e483a commit 6c713be

File tree

2 files changed

+84
-8
lines changed

2 files changed

+84
-8
lines changed

src/encoding.rs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
99
use crate::Error;
1010
use crate::Result;
1111

12+
/// Unicode "byte order mark" encoded as UTF-8
13+
pub static UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
14+
/// Unicode "byte order mark" encoded as UTF-16 with little-endian byte order
15+
pub static UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
16+
/// Unicode "byte order mark" encoded as UTF-16 with big-endian byte order
17+
pub static UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
18+
1219
/// Decoder of byte slices into strings.
1320
///
1421
/// If feature `encoding` is enabled, this encoding taken from the `"encoding"`
@@ -62,7 +69,7 @@ impl Decoder {
6269
///
6370
/// If you instead want to use XML declared encoding, use the `encoding` feature
6471
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
65-
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
72+
let bytes = if bytes.starts_with(UTF8_BOM) {
6673
&bytes[3..]
6774
} else {
6875
bytes
@@ -131,11 +138,11 @@ pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
131138

132139
#[cfg(feature = "encoding")]
133140
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
134-
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
141+
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
135142
bytes.split_at(3)
136-
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
143+
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
137144
bytes.split_at(2)
138-
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
145+
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
139146
bytes.split_at(2)
140147
} else {
141148
(&[], bytes)
@@ -172,9 +179,9 @@ fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
172179
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
173180
match bytes {
174181
// with BOM
175-
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
176-
_ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
177-
_ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
182+
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
183+
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
184+
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
178185

179186
// without BOM
180187
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2

src/writer.rs

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,30 @@
11
//! Contains high-level interface for an events-based XML emitter.
22
3+
use crate::encoding::UTF8_BOM;
34
use crate::errors::{Error, Result};
45
use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
56
use std::io::Write;
67

8+
/// Writer-side encoding schemes supported by quick-xml.
9+
///
10+
/// Currently, `quick-xml` only supports UTF-8 as an output encoding as the `encoding_rs`
11+
/// library does not provide encoders for any other encodings. If you need to write UTF-16
12+
/// encoded XML, consider writing the XML with a UTF-8 encoding and then re-encoding the file.
13+
#[derive(Clone, Debug)]
14+
pub enum EncodingScheme {
15+
/// UTF-8 text with no "BOM". This is the default, and recommended value.
16+
Utf8,
17+
/// UTF-8 with a "BOM" identifier. The standard recommends against this but some software
18+
/// requires it to be present.
19+
Utf8WithBom,
20+
}
21+
22+
impl Default for EncodingScheme {
23+
fn default() -> Self {
24+
Self::Utf8
25+
}
26+
}
27+
728
/// XML writer.
829
///
930
/// Writes XML `Event`s to a `Write` implementor.
@@ -57,6 +78,8 @@ pub struct Writer<W: Write> {
5778
/// underlying writer
5879
writer: W,
5980
indent: Option<Indentation>,
81+
encoding: EncodingScheme,
82+
first_write: bool,
6083
}
6184

6285
impl<W: Write> Writer<W> {
@@ -65,6 +88,8 @@ impl<W: Write> Writer<W> {
6588
Writer {
6689
writer: inner,
6790
indent: None,
91+
encoding: EncodingScheme::default(),
92+
first_write: true,
6893
}
6994
}
7095

@@ -73,6 +98,23 @@ impl<W: Write> Writer<W> {
7398
Writer {
7499
writer: inner,
75100
indent: Some(Indentation::new(indent_char, indent_size)),
101+
encoding: EncodingScheme::default(),
102+
first_write: true,
103+
}
104+
}
105+
106+
/// Creates a Writer with configured whitespace indents and output encoding scheme from a generic Write
107+
pub fn new_with_indent_and_encoding(
108+
inner: W,
109+
indent_char: u8,
110+
indent_size: usize,
111+
encoding: EncodingScheme,
112+
) -> Writer<W> {
113+
Writer {
114+
writer: inner,
115+
indent: Some(Indentation::new(indent_char, indent_size)),
116+
encoding: encoding,
117+
first_write: true,
76118
}
77119
}
78120

@@ -129,7 +171,15 @@ impl<W: Write> Writer<W> {
129171

130172
/// Writes bytes, preceded by the BOM on the very first write if the encoding scheme requires one
131173
#[inline]
132-
pub fn write(&mut self, value: &[u8]) -> Result<()> {
174+
fn write(&mut self, value: &[u8]) -> Result<()> {
175+
// The BOM should be the very first thing written to the file, but it should only be written once
176+
if self.first_write {
177+
match self.encoding {
178+
EncodingScheme::Utf8WithBom => self.writer.write_all(UTF8_BOM)?,
179+
_ => (),
180+
}
181+
self.first_write = false;
182+
}
133183
self.writer.write_all(value).map_err(Error::Io)
134184
}
135185

@@ -579,4 +629,23 @@ mod indentation {
579629
</outer>"#
580630
);
581631
}
632+
633+
#[test]
634+
fn write_utf8_with_bom() {
635+
let mut buffer = Vec::new();
636+
let mut writer =
637+
Writer::new_with_indent_and_encoding(&mut buffer, b' ', 4, EncodingScheme::Utf8WithBom);
638+
639+
writer
640+
.create_element("paired")
641+
.with_attribute(("attr1", "value1"))
642+
.with_attribute(("attr2", "value2"))
643+
.write_text_content(BytesText::new("text"))
644+
.expect("failure");
645+
646+
assert_eq!(
647+
&buffer,
648+
"\u{FEFF}<paired attr1=\"value1\" attr2=\"value2\">text</paired>".as_bytes()
649+
);
650+
}
582651
}

0 commit comments

Comments
 (0)