Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
594 changes: 345 additions & 249 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ edition = "2024"

[dependencies]
tokio = { version = "1", features = ["full"] }
html5ever = "0.26.0"
markup5ever_rcdom = "0.2.0"
html5ever = "0.35.0"
markup5ever_rcdom = "0.35.0"
regex = "1"
delegate = "0.12.0"
url = "2.5.4"
delegate = "0.13.4"
url = "2"

[dev-dependencies]
tempfile = "3"
28 changes: 14 additions & 14 deletions src/annotate_attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::io;
use std::rc::Rc;

use html5ever::tendril::StrTendril;
use html5ever::{LocalName, QualName, local_name, namespace_url, ns};
use html5ever::{LocalName, QualName, local_name, ns};
use markup5ever_rcdom::{Handle, NodeData};

use crate::dom_utils::{self, NodeHandleExt};
Expand Down Expand Up @@ -142,11 +142,11 @@ impl Processor {
let mut variant_comment = None;
let mut variant_str = None;
for node in description.iter() {
if let NodeData::Comment { contents } = &node.data {
if contents.trim().starts_with("or:") {
variant_comment = Some(node);
variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start()));
}
if let NodeData::Comment { contents } = &node.data
&& contents.trim().starts_with("or:")
{
variant_comment = Some(node);
variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start()));
}
}

Expand Down Expand Up @@ -276,14 +276,14 @@ impl Processor {
let mut dd_children = dd.children.borrow_mut();
if has_special_semantics {
// Replace the trailing period with a separating colon.
if let Some(last) = dd_children.last_mut() {
if let NodeData::Text { contents } = &last.data {
let mut text = contents.borrow_mut();
*text = StrTendril::from(
text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'),
);
text.push_slice(": ");
}
if let Some(last) = dd_children.last_mut()
&& let NodeData::Text { contents } = &last.data
{
let mut text = contents.borrow_mut();
*text = StrTendril::from(
text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'),
);
text.push_slice(": ");
}
} else {
// Insert an em dash.
Expand Down
7 changes: 3 additions & 4 deletions src/dom_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::cell::RefCell;
use std::rc::Rc;

use html5ever::tendril::StrTendril;
use html5ever::{Attribute, LocalName, QualName, local_name, namespace_url, ns};
use html5ever::{Attribute, LocalName, QualName, local_name, ns};
use markup5ever_rcdom::{Handle, Node, NodeData};

/// Extensions to the DOM interface to make manipulation more ergonomic.
Expand Down Expand Up @@ -137,10 +137,9 @@ pub fn scan_dom<F: FnMut(&Handle)>(handle: &Handle, f: &mut F) {
template_contents: ref tc,
..
} = handle.data
&& let Some(ref tc_handle) = *tc.borrow()
{
if let Some(ref tc_handle) = *tc.borrow() {
scan_dom(tc_handle, f);
}
scan_dom(tc_handle, f);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/interface_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::collections::BTreeMap;
use std::io;

use html5ever::tendril::StrTendril;
use html5ever::{QualName, local_name, namespace_url, ns};
use html5ever::{QualName, local_name, ns};
use markup5ever_rcdom::Handle;

use crate::dom_utils::NodeHandleExt;
Expand Down
20 changes: 20 additions & 0 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ pub async fn parse_fragment_async<R: AsyncRead + Unpin>(
RcDomWithLineNumbers::default(),
create_error_opts(),
context.clone(),
false,
None,
);

Expand Down Expand Up @@ -143,6 +144,25 @@ pub(crate) mod tests {
Ok(())
}

// See https://github.com/whatwg/html-build/issues/301
// Checks that a parse error located after a multi-line `<pre>` element is
// reported with the line number of the line it appears on.
// NOTE(review): presumably a regression test for the linked issue — confirm.
#[tokio::test]
async fn test_document_error_line_number_pre() -> io::Result<()> {
// Four-line input: the `<pre>` element spans lines 2-3, and the only
// apparent markup error (a `</span>` with no matching start tag) is on line 4.
let result = parse_document_async(
r##"<!DOCTYPE html>
<pre>h1&gt;
</pre>
<p>Test 2</span>"##
.as_bytes(),
)
.await;

// Parsing must fail with `InvalidData`, and the message must attribute
// an error to line 4.
let error = result.unwrap_err();
assert_eq!(error.kind(), io::ErrorKind::InvalidData);
assert!(error.to_string().contains("Line 4: "));

Ok(())
}

#[tokio::test]
async fn test_document_error_exact() -> io::Result<()> {
let result = parse_document_async("<!DOCTYPE html>&asdf;".as_bytes()).await;
Expand Down
54 changes: 28 additions & 26 deletions src/rcdom_with_line_numbers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@ use html5ever::interface::TreeSink;
use html5ever::{
Attribute, ExpandedName, QualName,
tendril::StrTendril,
tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode},
tree_builder::{ElementFlags, NodeOrText, QuirksMode},
};
use markup5ever_rcdom::{Handle, RcDom};
use std::borrow::Cow;
use std::cell::Cell;
use std::io;

pub struct RcDomWithLineNumbers {
dom: RcDom,
current_line: u64,
current_line: Cell<u64>,
}

impl RcDomWithLineNumbers {
Expand All @@ -23,10 +24,11 @@ impl RcDomWithLineNumbers {
}

pub fn create_error_from_parse_errors(&self) -> io::Result<()> {
if !self.dom.errors.is_empty() {
if !self.dom.errors.borrow().is_empty() {
let error_messages = self
.dom
.errors
.borrow()
.iter()
.map(|e| e.to_string())
.collect::<Vec<String>>()
Expand All @@ -45,7 +47,7 @@ impl Default for RcDomWithLineNumbers {
fn default() -> Self {
Self {
dom: RcDom::default(),
current_line: 1,
current_line: Cell::new(1),
}
}
}
Expand All @@ -54,15 +56,17 @@ impl TreeSink for RcDomWithLineNumbers {
type Output = RcDomWithLineNumbers;
type Handle = <RcDom as TreeSink>::Handle;

type ElemName<'a> = <RcDom as TreeSink>::ElemName<'a>;

// Override the parse_error method to add line numbers to the error messages.
fn parse_error(&mut self, msg: Cow<'static, str>) {
let msg_with_line = format!("Line {}: {}", self.current_line, msg);
fn parse_error(&self, msg: Cow<'static, str>) {
let msg_with_line = format!("Line {}: {}", self.current_line.get(), msg);
self.dom.parse_error(Cow::Owned(msg_with_line));
}

// Override to track the current line number.
fn set_current_line(&mut self, line: u64) {
self.current_line = line;
fn set_current_line(&self, line: u64) {
self.current_line.set(line);
}

// Override to return RcDomWithLineNumbers instead of RcDom.
Expand All @@ -73,69 +77,67 @@ impl TreeSink for RcDomWithLineNumbers {
// Delegate all other methods to RcDom.
delegate! {
to self.dom {
fn get_document(&mut self) -> Self::Handle;
fn get_document(&self) -> Self::Handle;

fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>;

fn create_element(
&mut self,
&self,
name: QualName,
attrs: Vec<Attribute>,
flags: ElementFlags,
) -> Self::Handle;

fn create_comment(&mut self, text: StrTendril) -> Self::Handle;
fn create_comment(&self, text: StrTendril) -> Self::Handle;

fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle;
fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle;

fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>);
fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>);

fn append_based_on_parent_node(
&mut self,
&self,
element: &Self::Handle,
prev_element: &Self::Handle,
child: NodeOrText<Self::Handle>,
);

fn append_doctype_to_document(
&mut self,
&self,
name: StrTendril,
public_id: StrTendril,
system_id: StrTendril,
);

fn mark_script_already_started(&mut self, node: &Self::Handle);
fn mark_script_already_started(&self, node: &Self::Handle);

fn pop(&mut self, node: &Self::Handle);
fn pop(&self, node: &Self::Handle);

fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle;
fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle;

fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool;

fn set_quirks_mode(&mut self, mode: QuirksMode);
fn set_quirks_mode(&self, mode: QuirksMode);

fn append_before_sibling(
&mut self,
&self,
sibling: &Self::Handle,
new_node: NodeOrText<Self::Handle>,
);

fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>);
fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>);

fn associate_with_form(
&mut self,
&self,
target: &Self::Handle,
form: &Self::Handle,
nodes: (&Self::Handle, Option<&Self::Handle>),
);

fn remove_from_parent(&mut self, target: &Self::Handle);
fn remove_from_parent(&self, target: &Self::Handle);

fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle);
fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle);

fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool;

fn complete_script(&mut self, node: &Self::Handle) -> NextParserState;
}
}
}
2 changes: 1 addition & 1 deletion src/self_link.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Inserts `<a class="self-link" href="#ID">` links for elements with `id` attributes and certain classes.

use html5ever::tendril::StrTendril;
use html5ever::{QualName, local_name, namespace_url, ns};
use html5ever::{QualName, local_name, ns};
use markup5ever_rcdom::Handle;
use url::Url;

Expand Down
35 changes: 17 additions & 18 deletions src/tag_omission.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::collections::HashMap;
use std::io;

use html5ever::tendril::StrTendril;
use html5ever::{LocalName, QualName, local_name, namespace_url, ns};
use html5ever::{LocalName, QualName, local_name, ns};
use markup5ever_rcdom::{Handle, NodeData};
use regex::Regex;

Expand Down Expand Up @@ -67,19 +67,17 @@ impl Processor {
// If we encounter the Void elements section, look for the next dt.
if node.is_html_element(&local_name!("dfn"))
&& node.text_content().trim() == "Void elements"
{
if let Some(dt) = node
&& let Some(dt) = node
.parent_node()
.filter(|n| n.is_html_element(&local_name!("dt")))
{
for dd in dom_utils::dt_descriptions(&dt) {
dom_utils::scan_dom(&dd, &mut |n| {
if n.is_html_element(&local_name!("code")) {
let info = self.elements.entry(n.text_content()).or_default();
info.is_void_element = true;
}
});
}
{
for dd in dom_utils::dt_descriptions(&dt) {
dom_utils::scan_dom(&dd, &mut |n| {
if n.is_html_element(&local_name!("code")) {
let info = self.elements.entry(n.text_content()).or_default();
info.is_void_element = true;
}
});
}
}

Expand All @@ -91,12 +89,13 @@ impl Processor {
}

// If we see a <dl class="element">, record that.
if node.is_html_element(&local_name!("dl")) && node.has_class("element") {
if let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn) {
let info = self.elements.entry(elem).or_default();
if info.dl.is_none() {
info.dl = Some(node.clone());
}
if node.is_html_element(&local_name!("dl"))
&& node.has_class("element")
&& let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn)
{
let info = self.elements.entry(elem).or_default();
if info.dl.is_none() {
info.dl = Some(node.clone());
}
}
}
Expand Down
Loading