Remove BOM from first-emitted text event

dralley · dralley · commit 2221871e0981 · 2022-08-16T00:29:31.000-04:00
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
 }
 
 #[cfg(feature = "encoding")]
-fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
+pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
     let (_, bytes) = split_at_bom(bytes, encoding);
     bytes
 }
diff --git a/src/reader/parser.rs b/src/reader/parser.rs
@@ -1,9 +1,7 @@
 #[cfg(feature = "encoding")]
 use encoding_rs::UTF_8;
 
-#[cfg(feature = "encoding")]
-use crate::encoding::detect_encoding;
-use crate::encoding::Decoder;
+use crate::encoding::{self, Decoder};
 use crate::errors::{Error, Result};
 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
 #[cfg(feature = "encoding")]
@@ -68,23 +66,31 @@ impl Parser {
     ///
     /// [`Text`]: Event::Text
     pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
-        #[cfg(feature = "encoding")]
-        if first && self.encoding.can_be_refined() {
-            if let Some(encoding) = detect_encoding(bytes) {
-                self.encoding = EncodingRef::BomDetected(encoding);
-            }
-        }
+        let mut content = bytes;
 
-        let content = if self.trim_text_end {
+        if self.trim_text_end {
             // Skip the ending '<'
             let len = bytes
                 .iter()
                 .rposition(|&b| !is_whitespace(b))
                 .map_or_else(|| bytes.len(), |p| p + 1);
-            &bytes[..len]
-        } else {
-            bytes
-        };
+            content = &bytes[..len];
+        }
+
+        if first {
+            #[cfg(feature = "encoding")]
+            if self.encoding.can_be_refined() {
+                if let Some(encoding) = encoding::detect_encoding(bytes) {
+                    self.encoding = EncodingRef::BomDetected(encoding);
+                    content = encoding::remove_bom(content, encoding);
+                }
+            }
+            #[cfg(not(feature = "encoding"))]
+            if content.starts_with(encoding::UTF8_BOM) {
+                content = &content[encoding::UTF8_BOM.len()..]; // slice `content`, not `bytes`, so the end-trim above is kept
+            }
+        }
+
         Ok(Event::Text(BytesText::wrap(content, self.decoder())))
     }
 
diff --git a/tests/encodings.rs b/tests/encodings.rs
@@ -1,4 +1,6 @@
+#[allow(unused_imports)] // presumably only used by the feature-gated tests below; `dead_code` does not cover `use` items
 use quick_xml::events::Event;
+#[allow(unused_imports)] // presumably only used by the feature-gated tests below; `dead_code` does not cover `use` items
 use quick_xml::Reader;
 
 #[cfg(feature = "encoding")]

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],`
`152`	`152`	`}`
`153`	`153`
`154`	`154`	`#[cfg(feature = "encoding")]`
`155`		`-fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {`
	`155`	`+pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {`
`156`	`156`	`let (_, bytes) = split_at_bom(bytes, encoding);`
`157`	`157`	`bytes`
`158`	`158`	`}`