Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -654,14 +654,7 @@ function processSource {
$QUIET || echo "Pre-processing the source..."
cp -p entities/out/entities.inc "$HTML_CACHE"
cp -p entities/out/entities-dtd.url "$HTML_CACHE"
if hash html-build 2>/dev/null; then
html-build <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"
else
local cargo_args=( --release )
$VERBOSE && cargo_args+=( --verbose )
$QUIET && cargo_args+=( --quiet )
cargo run "${cargo_args[@]}" <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"
fi
runRustTools <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"

runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output"
if [[ $WATTSI_RESULT == "0" ]]; then
Expand Down Expand Up @@ -690,7 +683,7 @@ function processSource {

if [[ $build_type == "default" ]]; then
# Singlepage HTML
mv "$HTML_TEMP/wattsi-output/index-html" "$HTML_OUTPUT/index.html"
runRustTools --singlepage-post <"$HTML_TEMP/wattsi-output/index-html" >"$HTML_OUTPUT/index.html"

if [[ $SINGLE_PAGE_ONLY == "false" ]]; then
# Singlepage Commit Snapshot
Expand Down Expand Up @@ -750,6 +743,22 @@ function checkWattsi {
fi
}

# Invokes the Rust-based build tools. A prebuilt html-build found in $PATH is preferred; when none
# is available the tools are compiled and run through cargo instead.
# - Arguments: forwarded verbatim to the tools
# - Output: whatever the tools write (stdin/stdout are passed straight through)
function runRustTools {
  # Fast path: an installed binary needs no compilation step.
  if hash html-build 2>/dev/null; then
    html-build "$@"
    return
  fi

  local cargo_args=( --release )
  if $VERBOSE; then
    cargo_args+=( --verbose )
  fi
  if $QUIET; then
    cargo_args+=( --quiet )
  fi
  # Everything after "--" is handed to the built program rather than to cargo itself.
  cargo_args+=( -- )
  cargo run "${cargo_args[@]}" "$@"
}

# Runs Wattsi on the given file, either locally or using the web service
# - Arguments:
# - $1: the file to run Wattsi on
Expand Down
160 changes: 160 additions & 0 deletions src/anchor_permanence.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
//! Postprocess step for ensuring anchor permanence: see
//! https://whatwg.org/working-mode#anchors.
//!
//! Scans for the `<script type="text/required-ids">` element, which lists
//! (whitespace-separated) IDs that must appear somewhere in the document.
//! After verifying that all listed IDs are present, removes the script element.

use crate::dom_utils::NodeHandleExt;
use html5ever::{QualName, local_name, ns};
use markup5ever_rcdom::Handle;
use std::collections::HashSet;

/// Checks that every ID listed in the `<script type="text/required-ids">`
/// element is carried by some element of the document, and removes that
/// script element once the check has passed.
///
/// Usage: call [`Processor::visit`] on every node of the document, then call
/// [`Processor::apply`] once.
#[derive(Default)]
pub struct Processor {
    /// IDs listed by the script that no visited element has carried so far.
    required_ids: HashSet<String>,
    /// IDs of elements visited before the script was encountered. They are
    /// kept so that the result does not depend on where the script sits
    /// relative to the elements it refers to.
    early_ids: HashSet<String>,
    /// The required-ids script element, once it has been encountered.
    script_node: Option<Handle>,
}

impl Processor {
    /// Creates a processor that has not seen any node yet.
    pub fn new() -> Self {
        Self::default()
    }

    /// Inspects a single node: records the required-ids script if `node` is
    /// that script, and marks the node's `id` attribute (if any) as present.
    ///
    /// # Panics
    ///
    /// Panics if more than one `<script type="text/required-ids">` element is
    /// visited.
    pub fn visit(&mut self, node: &Handle) {
        // Capture and parse the <script type="text/required-ids"> element exactly once.
        if node.is_html_element(&local_name!("script")) {
            const TYPE: QualName = QualName {
                prefix: None,
                ns: ns!(),
                local: local_name!("type"),
            };
            if node.get_attribute(&TYPE).as_deref() == Some("text/required-ids") {
                assert!(
                    self.script_node.is_none(),
                    "multiple required-ids scripts encountered"
                );
                self.script_node = Some(node.clone());

                // IDs that were already seen on earlier elements are satisfied;
                // only the remaining ones still have to show up. The early set
                // is no longer needed afterwards, so its storage is released.
                let early_ids = std::mem::take(&mut self.early_ids);
                let content = node.text_content();
                // `split_ascii_whitespace` never yields empty tokens.
                self.required_ids.extend(
                    content
                        .split_ascii_whitespace()
                        .filter(|id| !early_ids.contains(*id))
                        .map(str::to_string),
                );
            }
        }

        // For elements with an id attribute, mark the ID as seen.
        const ID_QN: QualName = QualName {
            prefix: None,
            ns: ns!(),
            local: local_name!("id"),
        };
        let Some(id) = node.get_attribute(&ID_QN) else {
            return;
        };
        if self.script_node.is_some() {
            self.required_ids.remove(id.as_ref());
        } else {
            // The script has not been found yet; remember the ID in case the
            // script shows up later and lists it.
            self.early_ids.insert(id.to_string());
        }
    }

    /// Finishes the check after all nodes have been visited.
    ///
    /// On success the script element (if one was found) is removed from the
    /// document.
    ///
    /// # Errors
    ///
    /// Returns an [`std::io::ErrorKind::InvalidData`] error naming the missing
    /// IDs, sorted, if any listed ID was not found. The script element is
    /// left in place in that case.
    pub fn apply(self) -> std::io::Result<()> {
        if !self.required_ids.is_empty() {
            let mut missing: Vec<_> = self.required_ids.into_iter().collect();
            // Sorted so the message is deterministic despite HashSet ordering.
            missing.sort_unstable();
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!(
                    "Missing required IDs for anchor permanence: {}",
                    missing.join(", ")
                ),
            ));
        }

        // Remove the script element (if present) after verification.
        if let Some(script) = self.script_node {
            script.remove();
        }
        Ok(())
    }
}

// Unit tests for the anchor-permanence processor. Each test parses a small
// HTML document, feeds every node to `Processor::visit` via `scan_dom`, and
// then checks the outcome of `Processor::apply` (or the panic from `visit`).
#[cfg(test)]
mod tests {
use super::*;
use crate::dom_utils;
use crate::parser::{parse_document_async, tests::serialize_for_test};

// All listed IDs are present, so `apply` succeeds and the serialized output
// no longer mentions the required-ids script.
#[tokio::test]
async fn removes_script_from_head() {
let document = parse_document_async(r#"<!DOCTYPE html>
<html><head><script type="text/required-ids">a b c</script></head><body><div id="a"></div><p id="b"></p><section id="c"></section></body></html>
"#.as_bytes()).await.unwrap();
let mut processor = Processor::new();
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
processor.apply().unwrap();
let serialized = serialize_for_test(&[document]);
assert!(!serialized.contains("text/required-ids"));
}

// Without a required-ids script, `apply` succeeds and the serialized
// document is the same before and after processing.
#[tokio::test]
async fn no_script_present_noop() {
let document = parse_document_async(
r#"<!DOCTYPE html>
<html><head></head><body></body></html>
"#
.as_bytes(),
)
.await
.unwrap();
let before = serialize_for_test(&[document.clone()]);
let mut processor = Processor::new();
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
processor.apply().unwrap();
assert_eq!(before, serialize_for_test(&[document]));
}

// IDs in the script may be separated by any mix of ASCII whitespace.
#[tokio::test]
async fn whitespace_splitting() {
// Includes indentation, multiple spaces, and newlines in the script content.
let document = parse_document_async(r#"<!DOCTYPE html><html><head><script type="text/required-ids">
foo bar
baz
qux
</script></head><body><div id="foo"></div><div id="bar"></div><div id="baz"></div><div id="qux"></div></body></html>
"#.as_bytes()).await.unwrap();
let mut processor = Processor::new();
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
processor.apply().unwrap();
let serialized = serialize_for_test(&[document]);
assert!(!serialized.contains("text/required-ids"));
}

// Only `foo` is present; the error message must list the other two IDs in
// sorted order.
#[tokio::test]
async fn errors_on_missing_ids() {
let document = parse_document_async(r#"<!DOCTYPE html>
<html><head><script type="text/required-ids">foo bar baz</script></head><body><div id="foo"></div></body></html>
"#.as_bytes()).await.unwrap();
let mut processor = Processor::new();
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
let err = processor.apply().expect_err("expected missing IDs error");
assert!(
err.to_string()
.contains("Missing required IDs for anchor permanence: bar, baz")
);
}

// A second required-ids script trips the assertion inside `visit`, so the
// panic happens during the scan and `apply` is never called.
#[tokio::test]
#[should_panic(expected = "multiple required-ids scripts encountered")]
async fn panics_on_multiple_required_ids_scripts() {
let document = parse_document_async(r#"<!DOCTYPE html><html><head>
<script type="text/required-ids">a b</script>
<script type="text/required-ids">c d</script>
</head><body><div id="a"></div><div id="b"></div><div id="c"></div><div id="d"></div></body></html>"#.as_bytes()).await.unwrap();
let mut processor = Processor::new();
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
}
}
9 changes: 9 additions & 0 deletions src/dom_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ pub trait NodeHandleExt {
where
Self: Sized;

/// Removes the node from its parent, leaving nothing in its place.
fn remove(&self)
where
Self: Sized;

/// Clones the node and its entire subtree (including template contents).
fn deep_clone(&self) -> Self;

Expand Down Expand Up @@ -326,6 +331,10 @@ impl NodeHandleExt for Handle {
self.parent.take();
}

/// Removes this node from its parent by replacing it with an empty list of nodes.
fn remove(&self) {
    let no_replacements = Vec::new();
    self.replace_with(no_replacements);
}

fn deep_clone(&self) -> Handle {
use NodeData::*;
let new_node_data = match &self.data {
Expand Down
36 changes: 34 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf};

use markup5ever_rcdom::SerializableHandle;

mod anchor_permanence;
mod annotate_attributes;
mod boilerplate;
mod dom_utils;
Expand All @@ -21,15 +22,25 @@ mod tag_omission;

#[tokio::main]
async fn main() -> io::Result<()> {
let is_post = env::args().any(|a| a == "--singlepage-post");
let result = if is_post {
// --singlepage-post runs the postprocess phase, which is currently only meant to be used on the
// singlepage output from Wattsi.
run_postprocess().await
} else {
// By default we run the preprocess phase, which creates a new input for Wattsi.
run_preprocess().await
};

// This gives slightly prettier error-printing.
if let Err(e) = run().await {
if let Err(e) = result {
eprintln!("{e}");
std::process::exit(1);
}
Ok(())
}

async fn run() -> io::Result<()> {
async fn run_preprocess() -> io::Result<()> {
// Since we're using Rc in the DOM implementation, we must ensure that tasks
// which act on it are confined to this thread.

Expand Down Expand Up @@ -79,6 +90,27 @@ async fn run() -> io::Result<()> {
Ok(())
}

// Postprocess phase: parses the document from stdin, runs the anchor-permanence check over every
// node, and writes the resulting document to stdout.
// The steps and considerations here are similar to run_preprocess.
async fn run_postprocess() -> io::Result<()> {
    let document = parser::parse_document_async(tokio::io::stdin()).await?;

    let mut anchor_permanence = anchor_permanence::Processor::new();

    dom_utils::scan_dom(&document, &mut |h| {
        anchor_permanence.visit(h);
    });

    // Fails (and therefore emits no output) when a required ID is missing.
    anchor_permanence.apply()?;

    let serializable: SerializableHandle = document.into();
    let mut writer = BufWriter::with_capacity(128 * 1024, io::stdout());
    serialize(&mut writer, &serializable, SerializeOpts::default())?;
    // Flush explicitly: when a BufWriter is merely dropped, errors from its final write are
    // discarded, which could leave truncated output behind while still reporting success.
    io::Write::flush(&mut writer)?;
    Ok(())
}

fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path>
where
V: AsRef<OsStr> + ?Sized,
Expand Down
Loading