Skip to content

Commit 87d241a

Browse files
authored
Merge pull request #455 from Mingun/read-text
Implement `read_text` - a method that returns a text between two tags
2 parents 2bf2d2d + 792d23d commit 87d241a

File tree

10 files changed

+240
-98
lines changed

10 files changed

+240
-98
lines changed

Changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
4141
under the `quick-xml::encoding` namespace.
4242
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
43+
- [#455]: Changed the return type of all `read_to_end*` methods to return a span between tags
44+
- [#455]: Added `Reader::read_text` method to return the raw content (including markup) between tags
4345

4446

4547
### Bug Fixes
@@ -139,6 +141,7 @@
139141
|`*_with_custom_entities`|`*_with`
140142
|`BytesText::unescaped()`|`BytesText::unescape()`
141143
|`Attribute::unescaped_*`|`Attribute::unescape_*`
144+
- [#329]: Also, those functions now borrow from the input instead of the event / attribute
142145

143146
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
144147
added to all events
@@ -181,6 +184,8 @@
181184
- [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
182185
array cannot guarantee borrowing due to possible copying while decoding.
183186

187+
- [#455]: Removed `Reader::read_text_into`, which was nothing more than a thin wrapper over a match on `Event::Text`
188+
184189
### New Tests
185190

186191
- [#9]: Added tests for incorrect nested tags in input
@@ -199,6 +204,7 @@
199204
[#180]: https://github.com/tafia/quick-xml/issues/180
200205
[#191]: https://github.com/tafia/quick-xml/issues/191
201206
[#324]: https://github.com/tafia/quick-xml/issues/324
207+
[#329]: https://github.com/tafia/quick-xml/issues/329
202208
[#363]: https://github.com/tafia/quick-xml/issues/363
203209
[#387]: https://github.com/tafia/quick-xml/pull/387
204210
[#391]: https://github.com/tafia/quick-xml/pull/391
@@ -220,6 +226,7 @@
220226
[#440]: https://github.com/tafia/quick-xml/pull/440
221227
[#443]: https://github.com/tafia/quick-xml/pull/443
222228
[#450]: https://github.com/tafia/quick-xml/pull/450
229+
[#455]: https://github.com/tafia/quick-xml/pull/455
223230

224231

225232
## 0.23.0 -- 2022-05-08

examples/read_texts.rs

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
fn main() {
22
use quick_xml::events::Event;
3-
use quick_xml::name::QName;
43
use quick_xml::Reader;
54

65
let xml = "<tag1>text1</tag1><tag1>text2</tag1>\
@@ -9,23 +8,18 @@ fn main() {
98
let mut reader = Reader::from_str(xml);
109
reader.trim_text(true);
1110

12-
let mut txt = Vec::new();
13-
let mut buf = Vec::new();
14-
1511
loop {
16-
match reader.read_event_into(&mut buf) {
17-
Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => {
18-
txt.push(
19-
reader
20-
.read_text_into(QName(b"tag2"), &mut Vec::new())
21-
.expect("Cannot decode text value"),
22-
);
12+
match reader.read_event() {
13+
Ok(Event::Start(e)) if e.name().as_ref() == b"tag2" => {
14+
// `read_text_into` is not implemented for buffered readers
15+
let txt = reader
16+
.read_text(e.name())
17+
.expect("Cannot decode text value");
2318
println!("{:?}", txt);
2419
}
2520
Ok(Event::Eof) => break, // exits the loop when reaching end of file
2621
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
2722
_ => (), // There are several other `Event`s we do not consider here
2823
}
29-
buf.clear();
3024
}
3125
}

src/de/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
951951
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
952952
match self.reader.read_to_end_into(name, &mut self.buf) {
953953
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
954-
other => Ok(other?),
954+
Err(e) => Err(e.into()),
955+
Ok(_) => Ok(()),
955956
}
956957
}
957958

@@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
991992
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
992993
match self.reader.read_to_end(name) {
993994
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
994-
other => Ok(other?),
995+
Err(e) => Err(e.into()),
996+
Ok(_) => Ok(()),
995997
}
996998
}
997999

src/events/attributes.rs

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
4141
///
4242
/// This method is available only if `encoding` feature is **not** enabled.
4343
#[cfg(any(doc, not(feature = "encoding")))]
44-
pub fn unescape_value(&self) -> XmlResult<Cow<str>> {
44+
pub fn unescape_value(&self) -> XmlResult<Cow<'a, str>> {
4545
self.unescape_value_with(|_| None)
4646
}
4747

@@ -61,19 +61,26 @@ impl<'a> Attribute<'a> {
6161
pub fn unescape_value_with<'entity>(
6262
&self,
6363
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
64-
) -> XmlResult<Cow<str>> {
64+
) -> XmlResult<Cow<'a, str>> {
6565
// from_utf8 should never fail because content is always UTF-8 encoded
66-
Ok(unescape_with(
67-
std::str::from_utf8(&self.value)?,
68-
resolve_entity,
69-
)?)
66+
let decoded = match &self.value {
67+
Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes)?),
68+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
69+
Cow::Owned(bytes) => Cow::Owned(std::str::from_utf8(bytes)?.to_string()),
70+
};
71+
72+
match unescape_with(&decoded, resolve_entity)? {
73+
// Because the result is borrowed, no replacements were made and we can use the original string
74+
Cow::Borrowed(_) => Ok(decoded),
75+
Cow::Owned(s) => Ok(s.into()),
76+
}
7077
}
7178

7279
/// Decodes then unescapes the value.
7380
///
7481
/// This will allocate if the value contains any escape sequences or in
7582
/// non-UTF-8 encoding.
76-
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<str>> {
83+
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
7784
self.decode_and_unescape_value_with(reader, |_| None)
7885
}
7986

@@ -85,8 +92,12 @@ impl<'a> Attribute<'a> {
8592
&self,
8693
reader: &Reader<B>,
8794
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
88-
) -> XmlResult<Cow<str>> {
89-
let decoded = reader.decoder().decode(&*self.value)?;
95+
) -> XmlResult<Cow<'a, str>> {
96+
let decoded = match &self.value {
97+
Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
98+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
99+
Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
100+
};
90101

91102
match unescape_with(&decoded, resolve_entity)? {
92103
// Because the result is borrowed, no replacements were made and we can use the original string

src/events/mod.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,7 @@ impl<'a> BytesText<'a> {
732732
///
733733
/// This will allocate if the value contains any escape sequences or in
734734
/// non-UTF-8 encoding.
735-
pub fn unescape(&self) -> Result<Cow<str>> {
735+
pub fn unescape(&self) -> Result<Cow<'a, str>> {
736736
self.unescape_with(|_| None)
737737
}
738738

@@ -743,8 +743,12 @@ impl<'a> BytesText<'a> {
743743
pub fn unescape_with<'entity>(
744744
&self,
745745
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
746-
) -> Result<Cow<str>> {
747-
let decoded = self.decoder.decode(&*self)?;
746+
) -> Result<Cow<'a, str>> {
747+
let decoded = match &self.content {
748+
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
749+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
750+
Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
751+
};
748752

749753
match unescape_with(&decoded, resolve_entity)? {
750754
// Because the result is borrowed, no replacements were made and we can use the original string
@@ -754,11 +758,9 @@ impl<'a> BytesText<'a> {
754758
}
755759

756760
/// Gets content of this text buffer in the specified encoding and optionally
757-
/// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime
758-
/// of the returned `Cow` is bound to the original buffer / input
761+
/// unescapes it.
759762
#[cfg(feature = "serialize")]
760763
pub(crate) fn decode(&self, unescape: bool) -> Result<Cow<'a, str>> {
761-
//TODO: too many copies, can be optimized
762764
let text = match &self.content {
763765
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
764766
// Convert to owned, because otherwise Cow will be bound with wrong lifetime

src/reader/async_tokio.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
99
use crate::events::Event;
1010
use crate::name::{QName, ResolveResult};
1111
use crate::reader::buffered_reader::impl_buffered_source;
12-
use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader};
12+
use crate::reader::{
13+
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
14+
};
1315
use crate::{Error, Result};
1416

1517
/// A struct for read XML asynchronously from an [`AsyncBufRead`].
@@ -125,7 +127,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
125127
/// // First, we read a start event...
126128
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
127129
///
128-
/// //...then, we could skip all events to the corresponding end event.
130+
/// // ...then, we could skip all events to the corresponding end event.
129131
/// // This call will correctly handle nested <outer> elements.
130132
/// // Note, however, that this method does not handle namespaces.
131133
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
@@ -142,8 +144,8 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
142144
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
143145
end: QName<'n>,
144146
buf: &mut Vec<u8>,
145-
) -> Result<()> {
146-
read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)
147+
) -> Result<Span> {
148+
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
147149
}
148150

149151
/// Read until '<' is found and moves reader to an `Opened` state.
@@ -275,7 +277,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
275277
/// (ResolveResult::Bound(ns), Event::Start(start))
276278
/// );
277279
///
278-
/// //...then, we could skip all events to the corresponding end event.
280+
/// // ...then, we could skip all events to the corresponding end event.
279281
/// // This call will correctly handle nested <outer> elements.
280282
/// // Note, however, that this method does not handle namespaces.
281283
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
@@ -295,7 +297,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
295297
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
296298
end: QName<'n>,
297299
buf: &mut Vec<u8>,
298-
) -> Result<()> {
300+
) -> Result<Span> {
299301
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
300302
// match literally the start name. See `Reader::check_end_names` documentation
301303
self.reader.read_to_end_into_async(end, buf).await

src/reader/buffered_reader.rs

Lines changed: 10 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use memchr;
1010
use crate::errors::{Error, Result};
1111
use crate::events::Event;
1212
use crate::name::QName;
13-
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
13+
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
1414

1515
macro_rules! impl_buffered_source {
1616
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
@@ -277,6 +277,10 @@ impl<R: BufRead> Reader<R> {
277277
/// storage for events content. This function is supposed to be called after
278278
/// you already read a [`Start`] event.
279279
///
280+
/// Returns a span that covers the content between `>` of an opening tag and `<` of
281+
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
282+
/// this method was called after reading expanded [`Start`] event.
283+
///
280284
/// Manages nested cases where parent and child elements have the same name.
281285
///
282286
/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
@@ -340,7 +344,7 @@ impl<R: BufRead> Reader<R> {
340344
/// // First, we read a start event...
341345
/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
342346
///
343-
/// //...then, we could skip all events to the corresponding end event.
347+
/// // ...then, we could skip all events to the corresponding end event.
344348
/// // This call will correctly handle nested <outer> elements.
345349
/// // Note, however, that this method does not handle namespaces.
346350
/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
@@ -353,60 +357,13 @@ impl<R: BufRead> Reader<R> {
353357
/// [`End`]: Event::End
354358
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
355359
/// [`read_to_end()`]: Self::read_to_end
360+
/// [`expand_empty_elements`]: Self::expand_empty_elements
356361
/// [`check_end_names`]: Self::check_end_names
357362
/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
358-
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
359-
read_to_end!(self, end, buf, read_event_impl, {
363+
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
364+
Ok(read_to_end!(self, end, buf, read_event_impl, {
360365
buf.clear();
361-
})
362-
}
363-
364-
/// Reads optional text between start and end tags.
365-
///
366-
/// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
367-
/// `String`. If the next event is an [`End`] event, returns the empty string. In all other
368-
/// cases, returns an error.
369-
///
370-
/// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
371-
/// if none is specified).
372-
///
373-
/// # Examples
374-
///
375-
/// ```
376-
/// # use pretty_assertions::assert_eq;
377-
/// use quick_xml::Reader;
378-
/// use quick_xml::events::Event;
379-
///
380-
/// let mut xml = Reader::from_reader(b"
381-
/// <a>&lt;b&gt;</a>
382-
/// <a></a>
383-
/// " as &[u8]);
384-
/// xml.trim_text(true);
385-
///
386-
/// let expected = ["<b>", ""];
387-
/// for &content in expected.iter() {
388-
/// match xml.read_event_into(&mut Vec::new()) {
389-
/// Ok(Event::Start(ref e)) => {
390-
/// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content);
391-
/// },
392-
/// e => panic!("Expecting Start event, found {:?}", e),
393-
/// }
394-
/// }
395-
/// ```
396-
///
397-
/// [`Text`]: Event::Text
398-
/// [`End`]: Event::End
399-
pub fn read_text_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<String> {
400-
let s = match self.read_event_into(buf) {
401-
Err(e) => return Err(e),
402-
403-
Ok(Event::Text(e)) => e.unescape()?.into_owned(),
404-
Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
405-
Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
406-
_ => return Err(Error::TextNotFound),
407-
};
408-
self.read_to_end_into(end, buf)?;
409-
Ok(s)
366+
}))
410367
}
411368
}
412369

src/reader/mod.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
#[cfg(feature = "encoding")]
44
use encoding_rs::Encoding;
5+
use std::ops::Range;
56

67
use crate::encoding::Decoder;
78
use crate::errors::{Error, Result};
@@ -238,16 +239,18 @@ macro_rules! read_to_end {
238239
$clear:block
239240
$(, $await:ident)?
240241
) => {{
242+
let start = $self.buffer_position();
241243
let mut depth = 0;
242244
loop {
243245
$clear
246+
let end = $self.buffer_position();
244247
match $self.$read_event($buf) $(.$await)? {
245248
Err(e) => return Err(e),
246249

247250
Ok(Event::Start(e)) if e.name() == $end => depth += 1,
248251
Ok(Event::End(e)) if e.name() == $end => {
249252
if depth == 0 {
250-
return Ok(());
253+
break start..end;
251254
}
252255
depth -= 1;
253256
}
@@ -270,6 +273,11 @@ mod slice_reader;
270273

271274
pub use ns_reader::NsReader;
272275

276+
/// Range of input in bytes that corresponds to some piece of XML
277+
pub type Span = Range<usize>;
278+
279+
////////////////////////////////////////////////////////////////////////////////////////////////////
280+
273281
/// Possible reader states. The state transition diagram (`true` and `false` shows
274282
/// value of [`Reader::expand_empty_elements()`] option):
275283
///

0 commit comments

Comments
 (0)