diff --git a/build.sh b/build.sh index 063f9f05..384ec269 100755 --- a/build.sh +++ b/build.sh @@ -654,14 +654,7 @@ function processSource { $QUIET || echo "Pre-processing the source..." cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if hash html-build 2>/dev/null; then - html-build <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete" - else - local cargo_args=( --release ) - $VERBOSE && cargo_args+=( --verbose ) - $QUIET && cargo_args+=( --quiet ) - cargo run "${cargo_args[@]}" <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete" - fi + runRustTools <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete" runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output" if [[ $WATTSI_RESULT == "0" ]]; then @@ -690,7 +683,7 @@ function processSource { if [[ $build_type == "default" ]]; then # Singlepage HTML - mv "$HTML_TEMP/wattsi-output/index-html" "$HTML_OUTPUT/index.html" + runRustTools --singlepage-post <"$HTML_TEMP/wattsi-output/index-html" >"$HTML_OUTPUT/index.html" if [[ $SINGLE_PAGE_ONLY == "false" ]]; then # Singlepage Commit Snapshot @@ -750,6 +743,22 @@ function checkWattsi { fi } +# Runs the Rust-based build tools, either with the version in $PATH or by using cargo to compile +# them beforehand. +# - Arguments: all arguments to pass to the tools +# - Output: whatever the tools output +function runRustTools { + if hash html-build 2>/dev/null; then + html-build "$@" + else + local cargo_args=( --release ) + $VERBOSE && cargo_args+=( --verbose ) + $QUIET && cargo_args+=( --quiet ) + cargo_args+=( -- ) + cargo run "${cargo_args[@]}" "$@" + fi +} + # Runs Wattsi on the given file, either locally or using the web service # - Arguments: # - $1: the file to run Wattsi on diff --git a/src/anchor_permanence.rs b/src/anchor_permanence.rs new file mode 100644 index 00000000..a8a1c144 --- /dev/null +++ b/src/anchor_permanence.rs @@ -0,0 +1,160 @@ +//! 
Postprocess step for ensuring anchor permanence: see +//! https://whatwg.org/working-mode#anchors. +//! +//! Scans for the `

+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + let serialized = serialize_for_test(&[document]); + assert!(!serialized.contains("text/required-ids")); + } + + #[tokio::test] + async fn no_script_present_noop() { + let document = parse_document_async( + r#" + +"# + .as_bytes(), + ) + .await + .unwrap(); + let before = serialize_for_test(&[document.clone()]); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + assert_eq!(before, serialize_for_test(&[document])); + } + + #[tokio::test] + async fn whitespace_splitting() { + // Includes indentation, multiple spaces, and newlines in the script content. + let document = parse_document_async(r#"
+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + let serialized = serialize_for_test(&[document]); + assert!(!serialized.contains("text/required-ids")); + } + + #[tokio::test] + async fn errors_on_missing_ids() { + let document = parse_document_async(r#" +
+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + let err = processor.apply().expect_err("expected missing IDs error"); + assert!( + err.to_string() + .contains("Missing required IDs for anchor permanence: bar, baz") + ); + } + + #[tokio::test] + #[should_panic(expected = "multiple required-ids scripts encountered")] + async fn panics_on_multiple_required_ids_scripts() { + let document = parse_document_async(r#" + + +
"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + } +} diff --git a/src/dom_utils.rs b/src/dom_utils.rs index 6a788e8d..dffb9fcb 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -76,6 +76,11 @@ pub trait NodeHandleExt { where Self: Sized; + /// Removes the node from its parent. + fn remove(&self) + where + Self: Sized; + /// Clones the node and its entire subtree (including template contents). fn deep_clone(&self) -> Self; @@ -326,6 +331,10 @@ impl NodeHandleExt for Handle { self.parent.take(); } + fn remove(&self) { + self.replace_with(Vec::new()); + } + fn deep_clone(&self) -> Handle { use NodeData::*; let new_node_data = match &self.data { diff --git a/src/main.rs b/src/main.rs index 853403ed..a3ad9085 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf}; use markup5ever_rcdom::SerializableHandle; +mod anchor_permanence; mod annotate_attributes; mod boilerplate; mod dom_utils; @@ -21,15 +22,25 @@ mod tag_omission; #[tokio::main] async fn main() -> io::Result<()> { + let is_post = env::args().any(|a| a == "--singlepage-post"); + let result = if is_post { + // --singlepage-post runs the postprocess phase, which is currently only meant to be used on the + // singlepage output from Wattsi. + run_postprocess().await + } else { + // By default we run the preprocess phase, which creates a new input for Wattsi. + run_preprocess().await + }; + // This gives slightly prettier error-printing. - if let Err(e) = run().await { + if let Err(e) = result { eprintln!("{e}"); std::process::exit(1); } Ok(()) } -async fn run() -> io::Result<()> { +async fn run_preprocess() -> io::Result<()> { // Since we're using Rc in the DOM implementation, we must ensure that tasks // which act on it are confined to this thread. @@ -79,6 +90,27 @@ async fn run() -> io::Result<()> { Ok(()) } +// The steps and considerations here are similar to run_preprocess. 
+/// Parses a document from stdin, runs the anchor permanence step on it, and serializes to stdout.
+async fn run_postprocess() -> io::Result<()> {
+    let document = parser::parse_document_async(tokio::io::stdin()).await?;
+
+    // Every node is visited before apply() runs; apply() is fallible and its error is propagated.
+    let mut anchor_permanence = anchor_permanence::Processor::new();
+    dom_utils::scan_dom(&document, &mut |h| {
+        anchor_permanence.visit(h);
+    });
+    anchor_permanence.apply()?;
+
+    let serializable: SerializableHandle = document.into();
+    // Keep the writer in a named binding: a temporary BufWriter is flushed only by Drop, which
+    // discards I/O errors, so truncated output could otherwise still exit with status 0.
+    let mut writer = BufWriter::with_capacity(128 * 1024, io::stdout());
+    serialize(&mut writer, &serializable, SerializeOpts::default())?;
+    io::Write::flush(&mut writer)?;
+    Ok(())
+}
+
 fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path>
 where
     V: AsRef + ?Sized,