Skip to content

Commit 8f60b58

Browse files
committed
Remove StartText
StartText would be out of place once all events are expected to contain UTF-8. Additionally the decoder implementation strips BOM bytes out of the bytestream so there's no good way to access them.
1 parent 87d241a commit 8f60b58

File tree

10 files changed

+10
-282
lines changed

10 files changed

+10
-282
lines changed

Changelog.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
- [#180]: Make `Decoder` struct public. You already had access to it via the
2121
`Reader::decoder()` method, but could not name it in the code. Now the preferred
2222
way to access decoding functionality is via this struct
23-
- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
24-
or a start comment or a tag. For streams with BOM this event will contain a BOM
2523
- [#395]: Add support for XML Schema `xs:list`
2624
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
2725
the XML declared encoding and always use UTF-8
@@ -99,15 +97,6 @@
9997
`Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
10098
Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
10199
`Reader::encoding()` is replaced by `Decoder::encoding()` as well
102-
- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and
103-
`BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked
104-
as expected, this was only due to good luck. They were replaced by the
105-
`BytesStartText::decode_with_bom_removal()`:
106-
- conceptually, you should decode BOM only for the first `Text` event from the
107-
reader (since now `StartText` event is emitted instead for this)
108-
- text before the first tag is not an XML content at all, so it is meaningless
109-
to try to unescape something in it
110-
111100
- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
112101
disabled. Signatures of functions are now the same regardless of whether or not the feature is
113102
enabled, and an error will be returned instead of performing replacements for invalid characters

benches/microbenches.rs

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -118,20 +118,6 @@ fn read_resolved_event_into(c: &mut Criterion) {
118118
/// Benchmarks how fast an individual event is parsed
119119
fn one_event(c: &mut Criterion) {
120120
let mut group = c.benchmark_group("One event");
121-
group.bench_function("StartText", |b| {
122-
let src = "Hello world!".repeat(512 / 12);
123-
b.iter(|| {
124-
let mut r = Reader::from_str(&src);
125-
let mut nbtxt = criterion::black_box(0);
126-
r.check_end_names(false).check_comments(false);
127-
match r.read_event() {
128-
Ok(Event::StartText(e)) => nbtxt += e.len(),
129-
something_else => panic!("Did not expect {:?}", something_else),
130-
};
131-
132-
assert_eq!(nbtxt, 504);
133-
})
134-
});
135121

136122
group.bench_function("Start", |b| {
137123
let src = format!(r#"<hello target="{}">"#, "world".repeat(512 / 5));

src/de/mod.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -929,10 +929,6 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
929929
let event = loop {
930930
let e = self.reader.read_event_into(&mut self.buf)?;
931931
match e {
932-
//TODO: Probably not the best idea treat StartText as usual text
933-
// Usually this event will represent a BOM
934-
// Changing this requires review of the serde-de::top_level::one_element test
935-
Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
936932
Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
937933
Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
938934
Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
@@ -974,10 +970,6 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
974970
loop {
975971
let e = self.reader.read_event()?;
976972
match e {
977-
//TODO: Probably not the best idea treat StartText as usual text
978-
// Usually this event will represent a BOM
979-
// Changing this requires review of the serde-de::top_level::one_element test
980-
Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
981973
Event::Start(e) => break Ok(DeEvent::Start(e)),
982974
Event::End(e) => break Ok(DeEvent::End(e)),
983975
Event::Text(e) => break Ok(DeEvent::Text(e)),

src/events/mod.rs

Lines changed: 0 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -50,69 +50,6 @@ use crate::name::{LocalName, QName};
5050
use crate::utils::write_cow_string;
5151
use attributes::{Attribute, Attributes};
5252

53-
/// Text that appeared before an XML declaration, a start element or a comment.
54-
///
55-
/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event
56-
/// contains something else except BOM, the XML should be considered ill-formed.
57-
///
58-
/// This is a reader-only event. If you need to write a text before the first tag,
59-
/// use the [`BytesText`] event.
60-
#[derive(Debug, Clone, Eq, PartialEq)]
61-
pub struct BytesStartText<'a> {
62-
content: BytesText<'a>,
63-
}
64-
65-
impl<'a> BytesStartText<'a> {
66-
/// Converts the event into an owned event.
67-
pub fn into_owned(self) -> BytesStartText<'static> {
68-
BytesStartText {
69-
content: self.content.into_owned(),
70-
}
71-
}
72-
73-
/// Extracts the inner `Cow` from the `BytesStartText` event container.
74-
#[inline]
75-
pub fn into_inner(self) -> Cow<'a, [u8]> {
76-
self.content.into_inner()
77-
}
78-
79-
/// Converts the event into a borrowed event.
80-
#[inline]
81-
pub fn borrow(&self) -> BytesStartText {
82-
BytesStartText {
83-
content: self.content.borrow(),
84-
}
85-
}
86-
87-
/// Decodes bytes of event, stripping byte order mark (BOM) if it is presented
88-
/// in the event.
89-
///
90-
/// This method does not unescape content, because no escape sequences can
91-
/// appear in the BOM or in the text before the first tag.
92-
pub fn decode_with_bom_removal(&self) -> Result<String> {
93-
//TODO: Fix lifetime issue - it should be possible to borrow string
94-
let decoded = self.content.decoder.decode_with_bom_removal(&*self)?;
95-
96-
Ok(decoded.to_string())
97-
}
98-
}
99-
100-
impl<'a> Deref for BytesStartText<'a> {
101-
type Target = BytesText<'a>;
102-
103-
fn deref(&self) -> &Self::Target {
104-
&self.content
105-
}
106-
}
107-
108-
impl<'a> From<BytesText<'a>> for BytesStartText<'a> {
109-
fn from(content: BytesText<'a>) -> Self {
110-
Self { content }
111-
}
112-
}
113-
114-
////////////////////////////////////////////////////////////////////////////////////////////////////
115-
11653
/// Opening tag data (`Event::Start`), with optional attributes.
11754
///
11855
/// `<name attr="value">`.
@@ -796,12 +733,6 @@ impl<'a> Deref for BytesText<'a> {
796733
}
797734
}
798735

799-
impl<'a> From<BytesStartText<'a>> for BytesText<'a> {
800-
fn from(content: BytesStartText<'a>) -> Self {
801-
content.content
802-
}
803-
}
804-
805736
////////////////////////////////////////////////////////////////////////////////////////////////////
806737

807738
/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
@@ -940,56 +871,6 @@ impl<'a> Deref for BytesCData<'a> {
940871
/// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into
941872
#[derive(Clone, Debug, Eq, PartialEq)]
942873
pub enum Event<'a> {
943-
/// Text that appeared before the first opening tag or an [XML declaration].
944-
/// [According to the XML standard][std], no text allowed before the XML
945-
/// declaration. However, if there is a BOM in the stream, some data may be
946-
/// present.
947-
///
948-
/// When this event is generated, it is the very first event emitted by the
949-
/// [`Reader`], and there can be the only one such event.
950-
///
951-
/// The [`Writer`] writes content of this event "as is" without encoding or
952-
/// escaping. If you write it, it should be written first and only one time
953-
/// (but writer does not enforce that).
954-
///
955-
/// # Examples
956-
///
957-
/// ```
958-
/// # use pretty_assertions::assert_eq;
959-
/// use std::borrow::Cow;
960-
/// use quick_xml::Reader;
961-
/// use quick_xml::events::Event;
962-
///
963-
/// // XML in UTF-8 with BOM
964-
/// let xml = b"\xEF\xBB\xBF<?xml version='1.0'?>".as_ref();
965-
/// let mut reader = Reader::from_reader(xml);
966-
/// let mut buf = Vec::new();
967-
/// let mut events_processed = 0;
968-
/// loop {
969-
/// match reader.read_event_into(&mut buf) {
970-
/// Ok(Event::StartText(e)) => {
971-
/// assert_eq!(events_processed, 0);
972-
/// // Content contains BOM
973-
/// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF"));
974-
/// }
975-
/// Ok(Event::Decl(_)) => {
976-
/// assert_eq!(events_processed, 1);
977-
/// }
978-
/// Ok(Event::Eof) => {
979-
/// assert_eq!(events_processed, 2);
980-
/// break;
981-
/// }
982-
/// e => panic!("Unexpected event {:?}", e),
983-
/// }
984-
/// events_processed += 1;
985-
/// }
986-
/// ```
987-
///
988-
/// [XML declaration]: Event::Decl
989-
/// [std]: https://www.w3.org/TR/xml11/#NT-document
990-
/// [`Reader`]: crate::reader::Reader
991-
/// [`Writer`]: crate::writer::Writer
992-
StartText(BytesStartText<'a>),
993874
/// Start tag (with attributes) `<tag attr="value">`.
994875
Start(BytesStart<'a>),
995876
/// End tag `</tag>`.
@@ -1017,7 +898,6 @@ impl<'a> Event<'a> {
1017898
/// buffer used when reading but incurring a new, separate allocation.
1018899
pub fn into_owned(self) -> Event<'static> {
1019900
match self {
1020-
Event::StartText(e) => Event::StartText(e.into_owned()),
1021901
Event::Start(e) => Event::Start(e.into_owned()),
1022902
Event::End(e) => Event::End(e.into_owned()),
1023903
Event::Empty(e) => Event::Empty(e.into_owned()),
@@ -1035,7 +915,6 @@ impl<'a> Event<'a> {
1035915
#[inline]
1036916
pub fn borrow(&self) -> Event {
1037917
match self {
1038-
Event::StartText(e) => Event::StartText(e.borrow()),
1039918
Event::Start(e) => Event::Start(e.borrow()),
1040919
Event::End(e) => Event::End(e.borrow()),
1041920
Event::Empty(e) => Event::Empty(e.borrow()),
@@ -1055,7 +934,6 @@ impl<'a> Deref for Event<'a> {
1055934

1056935
fn deref(&self) -> &[u8] {
1057936
match *self {
1058-
Event::StartText(ref e) => &*e,
1059937
Event::Start(ref e) | Event::Empty(ref e) => &*e,
1060938
Event::End(ref e) => &*e,
1061939
Event::Text(ref e) => &*e,

src/reader/mod.rs

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ pub type Span = Range<usize>;
286286
/// subgraph _
287287
/// direction LR
288288
///
289-
/// Init -- "(no event)"\nStartText --> OpenedTag
289+
/// Init -- "(no event)"\n --> OpenedTag
290290
/// OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
291291
/// ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag
292292
/// end
@@ -297,13 +297,13 @@ pub type Span = Range<usize>;
297297
#[derive(Clone)]
298298
enum ParseState {
299299
/// Initial state in which the reader stays after creation. Transition from that
300-
/// state could produce a `StartText`, `Decl`, `Comment` or `Start` event.
301-
/// The next state is always `OpenedTag`. The reader will never return to this
302-
/// state. The event emitted during transition to `OpenedTag` is a `StartEvent`
303-
/// if the first symbol not `<`, otherwise no event are emitted.
300+
/// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
301+
/// state is always `OpenedTag`. The reader will never return to this state. The
302+
/// event emitted during transition to `OpenedTag` is a `Text` event if the
303+
/// first symbol is not `<`, otherwise no event is emitted.
304304
Init,
305305
/// State after seeing the `<` symbol. Depending on the next symbol all other
306-
/// events (except `StartText`) could be generated.
306+
/// events could be generated.
307307
///
308308
/// After generating an event the reader moves to the `ClosedTag` state.
309309
OpenedTag,
@@ -553,8 +553,6 @@ impl<R> Reader<R> {
553553
}
554554

555555
/// Read until '<' is found and moves reader to an `OpenedTag` state.
556-
///
557-
/// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
558556
fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
559557
where
560558
R: XmlSource<'i, B>,
@@ -1564,16 +1562,6 @@ mod test {
15641562
use crate::reader::Reader;
15651563
use pretty_assertions::assert_eq;
15661564

1567-
#[$test]
1568-
$($async)? fn start_text() {
1569-
let mut reader = Reader::from_str("bom");
1570-
1571-
assert_eq!(
1572-
reader.$read_event($buf) $(.$await)? .unwrap(),
1573-
Event::StartText(BytesText::from_escaped("bom").into())
1574-
);
1575-
}
1576-
15771565
#[$test]
15781566
$($async)? fn declaration() {
15791567
let mut reader = Reader::from_str("<?xml ?>");

src/reader/parser.rs

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,11 @@ pub(super) struct Parser {
6363
}
6464

6565
impl Parser {
66-
/// Trims whitespaces from `bytes`, if required, and returns a [`StartText`]
67-
/// or a [`Text`] event. When [`StartText`] is returned, the method can change
68-
/// the encoding of the reader, detecting it from the beginning of the stream.
66+
/// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
6967
///
7068
/// # Parameters
7169
/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
72-
/// - `first`: if `true`, then this is the first call of that function,
73-
/// i. e. data from the start of stream and [`StartText`] will be returned,
74-
/// otherwise [`Text`] will be returned
7570
///
76-
/// [`StartText`]: Event::StartText
7771
/// [`Text`]: Event::Text
7872
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
7973
#[cfg(feature = "encoding")]
@@ -93,12 +87,7 @@ impl Parser {
9387
} else {
9488
bytes
9589
};
96-
97-
Ok(if first {
98-
Event::StartText(BytesText::wrap(content, self.decoder()).into())
99-
} else {
100-
Event::Text(BytesText::wrap(content, self.decoder()))
101-
})
90+
Ok(Event::Text(BytesText::wrap(content, self.decoder())))
10291
}
10392

10493
/// reads `BytesElement` starting with a `!`,

src/writer.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ impl<W: Write> Writer<W> {
8989
pub fn write_event<'a, E: AsRef<Event<'a>>>(&mut self, event: E) -> Result<()> {
9090
let mut next_should_line_break = true;
9191
let result = match *event.as_ref() {
92-
Event::StartText(ref e) => self.write(&e),
9392
Event::Start(ref e) => {
9493
let result = self.write_wrapped(b"<", e, b">");
9594
if let Some(i) = self.indent.as_mut() {

tests/test.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ fn fuzz_101() {
173173
fn test_no_trim() {
174174
let mut reader = Reader::from_str(" <tag> text </tag> ");
175175

176-
assert!(matches!(reader.read_event().unwrap(), StartText(_)));
176+
assert!(matches!(reader.read_event().unwrap(), Text(_)));
177177
assert!(matches!(reader.read_event().unwrap(), Start(_)));
178178
assert!(matches!(reader.read_event().unwrap(), Text(_)));
179179
assert!(matches!(reader.read_event().unwrap(), End(_)));
@@ -185,7 +185,7 @@ fn test_trim_end() {
185185
let mut reader = Reader::from_str(" <tag> text </tag> ");
186186
reader.trim_text_end(true);
187187

188-
assert!(matches!(reader.read_event().unwrap(), StartText(_)));
188+
assert!(matches!(reader.read_event().unwrap(), Text(_)));
189189
assert!(matches!(reader.read_event().unwrap(), Start(_)));
190190
assert!(matches!(reader.read_event().unwrap(), Text(_)));
191191
assert!(matches!(reader.read_event().unwrap(), End(_)));

0 commit comments

Comments
 (0)