From a9b335b2d7f5c3869f7ea71386af1617fd7a6c5d Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 30 Jun 2025 14:52:07 +0900 Subject: [PATCH 1/3] Add self-link insertion Insert <a class="self-link">s into elements with class="example", class="note", or class="XXX" that have id="" attributes. This matches Bikeshed's output. --- Cargo.lock | 295 +++++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 1 + src/dom_utils.rs | 26 +++++ src/main.rs | 4 + src/self_link.rs | 210 +++++++++++++++++++++++++++++++++ 5 files changed, 527 insertions(+), 9 deletions(-) create mode 100644 src/self_link.rs diff --git a/Cargo.lock b/Cargo.lock index de566004..cb1d1f99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -49,7 +49,18 @@ checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.43", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.43", ] [[package]] @@ -82,6 +93,15 @@ dependencies = [ "instant", ] +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futf" version = "0.1.5" @@ -128,6 +148,7 @@ dependencies = [ "regex", "tempfile", "tokio", + "url", ] [[package]] @@ -144,6 +165,113 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47"
+dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.0.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "instant" version = "0.1.12" @@ -176,6 +304,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + [[package]] name = "lock_api" version = "0.4.10" @@ -286,6 +420,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "phf" version = "0.10.1" @@ -330,6 +470,15 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -344,9 +493,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -459,9 +608,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "smallvec" -version = "1.10.0" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "socket2" @@ -473,6 +622,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "string_cache" version = "0.8.7" @@ -512,15 +667,26 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.43", +] + [[package]] name = "tempfile" version = "3.6.0" @@ -546,6 +712,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tokio" version = "1.28.2" @@ -573,7 +749,7 @@ checksum = 
"630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.43", ] [[package]] @@ -582,12 +758,29 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "utf-8" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -682,6 +875,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + [[package]] name = "xml5ever" version = "0.17.0" @@ -692,3 +891,81 @@ dependencies = [ "mac", "markup5ever", ] + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.43", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.43", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.43", +] diff --git a/Cargo.toml b/Cargo.toml index b298fe89..a339de12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" delegate = "0.12.0" +url = "2.2.2" [dev-dependencies] tempfile = "3" diff --git a/src/dom_utils.rs b/src/dom_utils.rs index a7ca9cdc..1fd8de4c 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -37,6 +37,9 @@ pub trait NodeHandleExt { /// Returns true if the node is an element with the given class. 
fn has_class(&self, class: &str) -> bool; + /// Returns true if the node is an element with any of the given classes. + fn has_any_class(&self, classes: &[&str]) -> bool; + /// Returns true if the node is an element with the given ID. fn has_id(&self, id: &str) -> bool { const ID: QualName = QualName { @@ -59,6 +62,11 @@ pub trait NodeHandleExt { /// Appends children (without checking node type). fn append_children(&self, children: impl Iterator); + /// Prepends a single child to the node's children. + fn prepend_child(&self, child: Self) + where + Self: Sized; + /// Inserts children before the specified child. fn insert_children_before(&self, existing: &Self, new: impl Iterator); @@ -242,6 +250,17 @@ impl NodeHandleExt for Handle { .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) } + fn has_any_class(&self, classes: &[&str]) -> bool { + const CLASS: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("class"), + }; + self.get_attribute(&CLASS).map_or(false, |v| { + v.split_ascii_whitespace().any(|c| classes.contains(&c)) + }) + } + fn node_text(&self) -> Option { match &self.data { NodeData::Text { ref contents } => Some(contents.borrow().clone()), @@ -270,6 +289,13 @@ impl NodeHandleExt for Handle { })); } + fn prepend_child(&self, child: Handle) { + let mut children = self.children.borrow_mut(); + let old_parent = child.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + children.insert(0, child); + } + fn insert_children_before(&self, existing: &Handle, new: impl Iterator) { let mut children = self.children.borrow_mut(); let i = children diff --git a/src/main.rs b/src/main.rs index 14ee3013..258654ac 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ mod io_utils; mod parser; mod rcdom_with_line_numbers; mod represents; +mod self_link; mod tag_omission; #[tokio::main] @@ -46,6 +47,7 @@ async fn run() -> io::Result<()> { let mut annotate_attributes = annotate_attributes::Processor::new(); 
let mut tag_omission = tag_omission::Processor::new(); let mut interface_index = interface_index::Processor::new(); + let mut self_link = self_link::Processor::new(); // We do exactly one pass to identify the changes that need to be made. dom_utils::scan_dom(&document, &mut |h| { @@ -54,6 +56,7 @@ async fn run() -> io::Result<()> { annotate_attributes.visit(h); tag_omission.visit(h); interface_index.visit(h); + self_link.visit(h); }); // And then we apply all of the changes. These different processors mostly @@ -64,6 +67,7 @@ async fn run() -> io::Result<()> { annotate_attributes.apply().await?; tag_omission.apply()?; interface_index.apply()?; + self_link.apply()?; // Finally, we write the result to standard out. let serializable: SerializableHandle = document.into(); diff --git a/src/self_link.rs b/src/self_link.rs new file mode 100644 index 00000000..de852272 --- /dev/null +++ b/src/self_link.rs @@ -0,0 +1,210 @@ +//! Inserts `<a class="self-link">` links for elements with `id` attributes and certain classes.
+ +use html5ever::tendril::StrTendril; +use html5ever::{QualName, local_name, namespace_url, ns}; +use markup5ever_rcdom::Handle; +use url::Url; + +use crate::dom_utils::NodeHandleExt; + +const TARGET_CLASSES: &[&str] = &["example", "note", "XXX"]; + +enum Edit { + InsertAsFirstChild(Handle, StrTendril), + InsertAfterSummary(Handle, StrTendril), +} + +pub struct Processor { + edits: Vec, +} + +impl Processor { + pub fn new() -> Self { + Self { edits: vec![] } + } + + pub fn visit(&mut self, node: &Handle) { + if !node.is_element() { + return; + } + + if !node.has_any_class(TARGET_CLASSES) { + return; + } + + if node.any_child(|c| c.has_class("self-link")) { + return; + } + + if let Some(id) = node.get_attribute(&QualName::new(None, ns!(), local_name!("id"))) { + if node.is_html_element(&local_name!("details")) { + self.edits.push(Edit::InsertAfterSummary(node.clone(), id)); + } else { + self.edits.push(Edit::InsertAsFirstChild(node.clone(), id)); + } + } + } + + pub fn apply(self) -> std::io::Result<()> { + for edit in self.edits { + match edit { + Edit::InsertAsFirstChild(node, id) => { + let link = create_self_link(&id); + node.prepend_child(link); + } + Edit::InsertAfterSummary(node, id) => { + let link = create_self_link(&id); + let summary = node + .children + .borrow() + .iter() + .find(|c| c.is_html_element(&local_name!("summary"))) + .cloned(); + + if let Some(summary) = summary { + let mut children = node.children.borrow_mut(); + let summary_pos = children + .iter() + .position(|c| std::rc::Rc::ptr_eq(c, &summary)) + .unwrap(); + children.insert(summary_pos + 1, link); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "details element with self-link target class has no summary", + )); + } + } + } + } + Ok(()) + } +} + +fn create_self_link(id: &str) -> Handle { + let mut url = Url::parse("https://html.spec.whatwg.org/multipage/").unwrap(); + url.set_fragment(Some(id)); + let href = url.fragment().unwrap_or(""); + + 
Handle::create_element(local_name!("a")) + .attribute(&local_name!("href"), format!("#{}", href)) + .attribute(&local_name!("class"), "self-link") + .build() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dom_utils; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_add_self_link() { + let document = parse_document_async( + r##" +
+
+
+
+
+"## + .as_bytes(), + ) + .await + .unwrap(); + + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
+
+
+
+
+"## + ); + } + + #[tokio::test] + async fn test_add_self_link_details() { + let document = parse_document_async( + r##" +
Foo
+"## + .as_bytes(), + ) + .await + .unwrap(); + + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
Foo
+"## + ); + } + + #[tokio::test] + async fn test_add_self_link_details_no_summary() { + let document = parse_document_async( + r##"
"##.as_bytes(), + ) + .await + .unwrap(); + + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + let result = processor.apply(); + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_add_self_link_already_present() { + let document = parse_document_async( + r##" +
+"## + .as_bytes(), + ) + .await + .unwrap(); + + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
+"## + ); + } + + #[tokio::test] + async fn test_url_encoding() { + let document = parse_document_async( + r##" +
+"## + .as_bytes(), + ) + .await + .unwrap(); + + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
+"## + ); + } +} From 703701fd99cf4df9b804344808753ef9a6a65c8d Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 30 Jun 2025 15:00:24 +0900 Subject: [PATCH 2/3] General Rust fixes and updates Update to Rust edition 2024, update dependency versions, and run Clippy to apply some suggested improvements. --- Cargo.lock | 44 ++++++++++++++++++++++------------ Cargo.toml | 4 ++-- Dockerfile | 2 +- ci-build/Dockerfile | 2 +- src/annotate_attributes.rs | 36 ++++++++++++++-------------- src/boilerplate.rs | 29 +++++++++++----------- src/dom_utils.rs | 15 ++++++------ src/interface_index.rs | 10 ++++---- src/main.rs | 4 ++-- src/parser.rs | 21 ++++++++-------- src/rcdom_with_line_numbers.rs | 4 ++-- src/represents.rs | 4 ++-- src/self_link.rs | 2 +- src/tag_omission.rs | 8 +++---- 14 files changed, 99 insertions(+), 86 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb1d1f99..f90498d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,7 +49,7 @@ checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", ] [[package]] @@ -60,7 +60,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", ] [[package]] @@ -502,9 +502,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -587,9 +587,23 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" -version = "1.0.164" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" +checksum = 
"5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] [[package]] name = "signal-hook-registry" @@ -667,9 +681,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.43" +version = "2.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" dependencies = [ "proc-macro2", "quote", @@ -684,7 +698,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", ] [[package]] @@ -749,14 +763,14 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", ] [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "url" @@ -912,7 +926,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", "synstructure", ] @@ -933,7 +947,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.104", "synstructure", ] @@ -967,5 +981,5 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.43", + "syn 2.0.104", ] diff --git a/Cargo.toml b/Cargo.toml index a339de12..c2fd6d61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "html-build" version = "0.0.0" publish = false -edition = "2021" +edition = "2024" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -12,7 +12,7 @@ html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" delegate = "0.12.0" -url = "2.2.2" +url = "2.5.4" [dev-dependencies] tempfile = "3" diff --git a/Dockerfile b/Dockerfile index 773335e1..52611947 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73-slim as builder +FROM rust:1.88-slim as builder WORKDIR /whatwg/html-build COPY Cargo.lock Cargo.toml ./ COPY src ./src/ diff --git a/ci-build/Dockerfile b/ci-build/Dockerfile index 4312d778..4a06e5d1 100644 --- a/ci-build/Dockerfile +++ b/ci-build/Dockerfile @@ -1,6 +1,6 @@ # This Dockerfile is just used to run on Travis CI in an environment that can easily and repeatedly # install our build dependencies. 
-FROM rust:1.73-slim as builder +FROM rust:1.88-slim as builder WORKDIR /whatwg/html-build COPY Cargo.lock Cargo.toml ./ COPY src ./src/ diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index 33a95bfe..afa155e2 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -5,7 +5,7 @@ use std::io; use std::rc::Rc; use html5ever::tendril::StrTendril; -use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use html5ever::{LocalName, QualName, local_name, namespace_url, ns}; use markup5ever_rcdom::{Handle, NodeData}; use crate::dom_utils::{self, NodeHandleExt}; @@ -142,7 +142,7 @@ impl Processor { let mut variant_comment = None; let mut variant_str = None; for node in description.iter() { - if let NodeData::Comment { ref contents } = node.data { + if let NodeData::Comment { contents } = &node.data { if contents.trim().starts_with("or:") { variant_comment = Some(node); variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start())); @@ -158,7 +158,7 @@ impl Processor { .children .borrow() .iter() - .filter(|c| variant_comment.map_or(true, |vc| !Rc::ptr_eq(c, vc))) + .filter(|c| variant_comment.is_none_or(|vc| !Rc::ptr_eq(c, vc))) .map(|c| c.deep_clone()) .collect(), variant: variant_str, @@ -167,7 +167,7 @@ impl Processor { if existing.default.is_empty() { existing.default = descriptions.default; } else if !descriptions.default.is_empty() { - if let NodeData::Text { ref contents } = existing.default.last().unwrap().data { + if let NodeData::Text { contents } = &existing.default.last().unwrap().data { let mut borrow = contents.borrow_mut(); if let Some(last_non_ws) = borrow.rfind(|c: char| !c.is_ascii_whitespace()) { @@ -209,13 +209,13 @@ impl Processor { let mut has_special_semantics = false; let mut key = None; dom_utils::scan_dom(dd, &mut |n| match &n.data { - NodeData::Comment { ref contents } if contents.trim() == "no-annotate" => { + NodeData::Comment { contents } if contents.trim() == "no-annotate" => { 
can_annotate = false; } - NodeData::Comment { ref contents } if contents.trim() == "variant" => { + NodeData::Comment { contents } if contents.trim() == "variant" => { wants_variant_description = true; } - NodeData::Text { ref contents } + NodeData::Text { contents } if contents.borrow().contains("has special semantics") => { has_special_semantics = true; @@ -257,7 +257,7 @@ impl Processor { }; let mut description: Vec = match descriptions { Descriptions { - variant: Some(ref variant), + variant: Some(variant), .. } if wants_variant_description => { parser::parse_fragment_async(variant[..].as_bytes(), &dd).await? @@ -268,22 +268,22 @@ impl Processor { format!( "Attribute {key} wants variant description, but no was found" ), - )) - } - Descriptions { ref default, .. } => { - default.iter().map(|n| n.deep_clone()).collect() + )); } + Descriptions { default, .. } => default.iter().map(|n| n.deep_clone()).collect(), }; let mut dd_children = dd.children.borrow_mut(); if has_special_semantics { // Replace the trailing period with a separating colon. - if let Some(NodeData::Text { contents }) = dd_children.last_mut().map(|n| &n.data) { - let mut text = contents.borrow_mut(); - *text = StrTendril::from( - text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), - ); - text.push_slice(": "); + if let Some(last) = dd_children.last_mut() { + if let NodeData::Text { contents } = &last.data { + let mut text = contents.borrow_mut(); + *text = StrTendril::from( + text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), + ); + text.push_slice(": "); + } } } else { // Insert an em dash. 
diff --git a/src/boilerplate.rs b/src/boilerplate.rs index 5a4a6209..4e6ded28 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -7,7 +7,7 @@ use std::io; use std::path::{Path, PathBuf}; use html5ever::tendril::{self, SendTendril}; -use html5ever::{local_name, Attribute, LocalName, QualName}; +use html5ever::{Attribute, LocalName, QualName, local_name}; use markup5ever_rcdom::{Handle, NodeData}; use tokio::fs::File; use tokio::task::JoinHandle; @@ -48,11 +48,11 @@ impl Processor { /// Identifies replacements which will be needed, and starts the necessary /// I/O. pub fn visit(&mut self, node: &Handle) { - match node.data { + match &node.data { // BOILERPLATE comments will need to be replaced with their // corresponding HTML, parsed. Open the file so that we can do so on // demand. - NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => { + NodeData::Comment { contents } if contents.starts_with("BOILERPLATE ") => { let path = Path::new(contents[12..].trim()); let file = if is_safe_path(path) { tokio::spawn(File::open(self.path.join(path))) @@ -67,12 +67,8 @@ impl Processor { // Pseudo-comments can also appear in element attributes. These are // not parsed as HTML, so we simply want to read them into memory so // they can be replaced. - NodeData::Element { ref attrs, .. } => { - for Attribute { - ref name, - ref value, - } in attrs.borrow().iter() - { + NodeData::Element { attrs, .. } => { + for Attribute { name, value } in attrs.borrow().iter() { if value.starts_with("") { let path = Path::new(value[16..value.len() - 3].trim()); let file_contents = if is_safe_path(path) { @@ -94,7 +90,7 @@ impl Processor { //
 and 
 which contain EXAMPLE also need to be
             // replaced, but as plain text. These are loaded from the "examples"
             // directory instead.
-            NodeData::Text { ref contents } => {
+            NodeData::Text { contents } => {
                 let borrowed_contents = contents.borrow();
                 let text = borrowed_contents.trim();
                 if !text.starts_with("EXAMPLE ") {
@@ -102,10 +98,10 @@ impl Processor {
                 }
                 const PRE: LocalName = local_name!("pre");
                 const CODE: LocalName = local_name!("code");
-                let has_suitable_parent = node.parent_node().map_or(false, |p| {
+                let has_suitable_parent = node.parent_node().is_some_and(|p| {
                     p.is_html_element(&PRE)
                         || (p.is_html_element(&CODE)
-                            && p.parent_node().map_or(false, |p2| p2.is_html_element(&PRE)))
+                            && p.parent_node().is_some_and(|p2| p2.is_html_element(&PRE)))
                 });
                 if has_suitable_parent {
                     let path = Path::new(text[8..].trim());
@@ -179,7 +175,8 @@ mod tests {
         proc.apply().await?;
         assert_eq!(
             serialize_for_test(&[document]),
-            "
enEnglish
"); + "
enEnglish
" + ); Ok(()) } @@ -200,7 +197,8 @@ mod tests { proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "hello"); + "hello" + ); Ok(()) } @@ -218,7 +216,8 @@ mod tests { proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
first
second

EXAMPLE ignored

" ); + "
first
second

EXAMPLE ignored

" + ); Ok(()) } diff --git a/src/dom_utils.rs b/src/dom_utils.rs index 1fd8de4c..4bd3e4e7 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -2,7 +2,7 @@ use std::cell::RefCell; use std::rc::Rc; use html5ever::tendril::StrTendril; -use html5ever::{local_name, namespace_url, ns, Attribute, LocalName, QualName}; +use html5ever::{Attribute, LocalName, QualName, local_name, namespace_url, ns}; use markup5ever_rcdom::{Handle, Node, NodeData}; /// Extensions to the DOM interface to make manipulation more ergonimc. @@ -231,7 +231,7 @@ impl NodeHandleExt for Handle { name: QualName { ns: ns!(html), - ref local, + local, .. }, .. @@ -247,7 +247,7 @@ impl NodeHandleExt for Handle { local: local_name!("class"), }; self.get_attribute(&CLASS) - .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) + .is_some_and(|v| v.split_ascii_whitespace().any(|c| c == class)) } fn has_any_class(&self, classes: &[&str]) -> bool { @@ -256,14 +256,13 @@ impl NodeHandleExt for Handle { ns: ns!(), local: local_name!("class"), }; - self.get_attribute(&CLASS).map_or(false, |v| { - v.split_ascii_whitespace().any(|c| classes.contains(&c)) - }) + self.get_attribute(&CLASS) + .is_some_and(|v| v.split_ascii_whitespace().any(|c| classes.contains(&c))) } fn node_text(&self) -> Option { match &self.data { - NodeData::Text { ref contents } => Some(contents.borrow().clone()), + NodeData::Text { contents } => Some(contents.borrow().clone()), _ => None, } } @@ -271,7 +270,7 @@ impl NodeHandleExt for Handle { fn text_content(&self) -> StrTendril { let mut text = StrTendril::new(); scan_dom(self, &mut |n| { - if let NodeData::Text { ref contents } = &n.data { + if let NodeData::Text { contents } = &n.data { text.push_tendril(&contents.borrow()); } }); diff --git a/src/interface_index.rs b/src/interface_index.rs index b9bb2170..793b4bd1 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -5,7 +5,7 @@ use std::collections::BTreeMap; use std::io; use 
html5ever::tendril::StrTendril; -use html5ever::{local_name, namespace_url, ns, QualName}; +use html5ever::{QualName, local_name, namespace_url, ns}; use markup5ever_rcdom::Handle; use crate::dom_utils::NodeHandleExt; @@ -58,9 +58,9 @@ impl Processor { // attributes. if node.is_html_element(&local_name!("code")) && node.has_class("idl") - && node.parent_node().map_or(false, |p| { - p.is_html_element(&local_name!("pre")) && !p.has_class("extract") - }) + && node + .parent_node() + .is_some_and(|p| p.is_html_element(&local_name!("pre")) && !p.has_class("extract")) { let borrowed_children = node.children.borrow(); for window in borrowed_children.windows(2) { @@ -90,7 +90,7 @@ impl Processor { } } - if node.node_text().map_or(false, |t| t.contains(MARKER)) { + if node.node_text().is_some_and(|t| t.contains(MARKER)) { self.marker_nodes.push(node.clone()); } } diff --git a/src/main.rs b/src/main.rs index 258654ac..853403ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,4 @@ -use html5ever::serialize::{serialize, SerializeOpts}; +use html5ever::serialize::{SerializeOpts, serialize}; use std::borrow::Cow; use std::default::Default; use std::env; @@ -23,7 +23,7 @@ mod tag_omission; async fn main() -> io::Result<()> { // This gives slightly prettier error-printing. 
if let Err(e) = run().await { - eprintln!("{}", e); + eprintln!("{e}"); std::process::exit(1); } Ok(()) diff --git a/src/parser.rs b/src/parser.rs index 9c9feeb9..521eb4c3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -131,9 +131,10 @@ pub(crate) mod tests { #[tokio::test] async fn test_document_error_line_number() -> io::Result<()> { - let result = - parse_document_async("Hello\nworld".as_bytes()) - .await; + let result = parse_document_async( + "Hello\nworld".as_bytes(), + ) + .await; let error = result.unwrap_err(); assert_eq!(error.kind(), io::ErrorKind::InvalidData); @@ -144,9 +145,7 @@ pub(crate) mod tests { #[tokio::test] async fn test_document_error_exact() -> io::Result<()> { - let result = - parse_document_async("&asdf;".as_bytes()) - .await; + let result = parse_document_async("&asdf;".as_bytes()).await; let error = result.unwrap_err(); assert_eq!(error.kind(), io::ErrorKind::InvalidData); @@ -160,8 +159,11 @@ pub(crate) mod tests { let document = parse_document_async("".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); - let result = - parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; + let result = parse_fragment_async( + "Hello \n\nworld".as_bytes(), + &body, + ) + .await; let error = result.unwrap_err(); assert_eq!(error.kind(), io::ErrorKind::InvalidData); @@ -175,8 +177,7 @@ pub(crate) mod tests { let document = parse_document_async("".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); - let result = - parse_fragment_async("&asdf;".as_bytes(), &body).await; + let result = parse_fragment_async("&asdf;".as_bytes(), &body).await; let error = result.unwrap_err(); assert_eq!(error.kind(), io::ErrorKind::InvalidData); diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs index 3f5e6c24..2f8cb6cd 100644 --- 
a/src/rcdom_with_line_numbers.rs +++ b/src/rcdom_with_line_numbers.rs @@ -3,9 +3,9 @@ use delegate::delegate; use html5ever::interface::TreeSink; use html5ever::{ + Attribute, ExpandedName, QualName, tendril::StrTendril, tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode}, - Attribute, ExpandedName, QualName, }; use markup5ever_rcdom::{Handle, RcDom}; use std::borrow::Cow; @@ -33,7 +33,7 @@ impl RcDomWithLineNumbers { .join("\n"); Err(io::Error::new( io::ErrorKind::InvalidData, - format!("Parse errors encountered:\n\n{}", error_messages), + format!("Parse errors encountered:\n\n{error_messages}"), )) } else { Ok(()) diff --git a/src/represents.rs b/src/represents.rs index e357f41e..73b24d94 100644 --- a/src/represents.rs +++ b/src/represents.rs @@ -77,7 +77,7 @@ impl Processor { None => { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!(" refers to unknown tag", tag), + format!(" refers to unknown tag"), )); } }; @@ -95,7 +95,7 @@ impl Processor { .map(|(index, sibling)| { let clone = sibling.deep_clone(); // Capitalize the first letter of the first node (which is expected to be text). 
- if let (0, NodeData::Text { ref contents }) = (index, &clone.data) { + if let (0, NodeData::Text { contents }) = (index, &clone.data) { contents.replace_with(|text| capitalize(text.trim_start())); } clone diff --git a/src/self_link.rs b/src/self_link.rs index de852272..2dfbab8d 100644 --- a/src/self_link.rs +++ b/src/self_link.rs @@ -87,7 +87,7 @@ fn create_self_link(id: &str) -> Handle { let href = url.fragment().unwrap_or(""); Handle::create_element(local_name!("a")) - .attribute(&local_name!("href"), format!("#{}", href)) + .attribute(&local_name!("href"), format!("#{href}")) .attribute(&local_name!("class"), "self-link") .build() } diff --git a/src/tag_omission.rs b/src/tag_omission.rs index 8676b19e..16df1b76 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -7,11 +7,11 @@ use std::collections::HashMap; use std::io; use html5ever::tendril::StrTendril; -use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use html5ever::{LocalName, QualName, local_name, namespace_url, ns}; use markup5ever_rcdom::{Handle, NodeData}; use regex::Regex; -use crate::dom_utils::{self, heading_level, NodeHandleExt}; +use crate::dom_utils::{self, NodeHandleExt, heading_level}; #[derive(Default)] struct ElementInfo { @@ -108,10 +108,10 @@ impl Processor { match (iter.next(), iter.next(), iter.next()) { (Some(a), Some(b), Some(c)) if a.node_text() - .map_or(false, |t| t.trim() == "A" || t.trim() == "An") + .is_some_and(|t| t.trim() == "A" || t.trim() == "An") && b.is_html_element(&local_name!("code")) && c.node_text() - .map_or(false, |t| t.trim().starts_with("element")) => + .is_some_and(|t| t.trim().starts_with("element")) => { let info = self.elements.entry(b.text_content()).or_default(); info.optional_tags_info.push(paragraph.clone()); From 65f6b0e38b15baed7371c4894e11d0fa68985117 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 30 Jun 2025 15:46:19 +0900 Subject: [PATCH 3/3] Run Rust checks on CI Also refactor CI into separate jobs that 
depend on each other. --- .github/workflows/build.yml | 55 +++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c9495cc3..f6e3ca1a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,21 +11,46 @@ env: jobs: - build: - name: Build + static-checks: + name: Static Checks + runs-on: ubuntu-latest + steps: + - name: Checkout whatwg/html-build + uses: actions/checkout@v3 + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt, clippy + - name: Cache Cargo dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - name: Check formatting + run: cargo fmt --check + - name: Run clippy + run: cargo clippy -- -D warnings + - name: Run tests + run: cargo test + - name: Shellcheck + run: | + shellcheck *.sh + shellcheck ci-build/*.sh + + build-and-test-image: + name: Build and Test Image runs-on: ubuntu-latest - permissions: - contents: read - packages: write steps: - name: Checkout whatwg/html-build uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Shellcheck - run: | - shellcheck *.sh - shellcheck ci-build/*.sh - name: Docker build run: ci-build/docker-build.sh - name: Checkout whatwg/html @@ -38,15 +63,23 @@ jobs: run: | mkdir output bash ci-build/docker-run.sh "$GITHUB_WORKSPACE/html" output + + publish: + name: Publish + runs-on: ubuntu-latest + needs: [static-checks, build-and-test-image] + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + permissions: + contents: read + packages: write + steps: - name: Docker login - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} uses: docker/login-action@v2 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor 
}} password: ${{ secrets.GITHUB_TOKEN }} - name: Docker push - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} run: | docker tag "$REGISTRY/$IMAGE_NAME" "$REGISTRY/$IMAGE_NAME:$GITHUB_SHA" docker tag "$REGISTRY/$IMAGE_NAME" "$REGISTRY/$IMAGE_NAME:latest"