Skip to content

Commit ebed3e1

Browse files
committed
Implement anchor permanence checking
This adds a postprocess step, written in Rust. It looks for `<script type="text/required-ids">` elements listing all the required IDs, and errors out if those IDs are not found. (This cannot be done at preprocess time as many IDs are generated by Wattsi, e.g., from data-x="" attributes or from the TOC generation.) Closes #304.
1 parent e3d362b commit ebed3e1

File tree

4 files changed

+221
-11
lines changed

4 files changed

+221
-11
lines changed

build.sh

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -654,14 +654,7 @@ function processSource {
654654
$QUIET || echo "Pre-processing the source..."
655655
cp -p entities/out/entities.inc "$HTML_CACHE"
656656
cp -p entities/out/entities-dtd.url "$HTML_CACHE"
657-
if hash html-build 2>/dev/null; then
658-
html-build <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"
659-
else
660-
local cargo_args=( --release )
661-
$VERBOSE && cargo_args+=( --verbose )
662-
$QUIET && cargo_args+=( --quiet )
663-
cargo run "${cargo_args[@]}" <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"
664-
fi
657+
runRustTools <"$HTML_SOURCE/$source_location" >"$HTML_TEMP/source-whatwg-complete"
665658

666659
runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output"
667660
if [[ $WATTSI_RESULT == "0" ]]; then
@@ -690,7 +683,7 @@ function processSource {
690683

691684
if [[ $build_type == "default" ]]; then
692685
# Singlepage HTML
693-
mv "$HTML_TEMP/wattsi-output/index-html" "$HTML_OUTPUT/index.html"
686+
runRustTools --singlepage-post <"$HTML_TEMP/wattsi-output/index-html" >"$HTML_OUTPUT/index.html"
694687

695688
if [[ $SINGLE_PAGE_ONLY == "false" ]]; then
696689
# Singlepage Commit Snapshot
@@ -750,6 +743,22 @@ function checkWattsi {
750743
fi
751744
}
752745

746+
# Invokes the Rust-based build tools. A prebuilt `html-build` found in $PATH is preferred;
# when there is none, the tools are compiled and run through `cargo run --release`.
# stdin and stdout are left untouched, so callers can redirect them as needed.
# - Arguments: forwarded verbatim to the tools
# - Output: whatever the tools output
function runRustTools {
  if hash html-build 2>/dev/null; then
    html-build "$@"
  else
    # Options for cargo itself; everything after the literal "--" goes to the tools.
    local cargo_args=( --release )
    if $VERBOSE; then
      cargo_args+=( --verbose )
    fi
    if $QUIET; then
      cargo_args+=( --quiet )
    fi
    cargo run "${cargo_args[@]}" -- "$@"
  fi
}
761+
753762
# Runs Wattsi on the given file, either locally or using the web service
754763
# - Arguments:
755764
# - $1: the file to run Wattsi on

src/anchor_permanence.rs

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
//! Postprocess step for ensuring anchor permanence: see
2+
//! https://whatwg.org/working-mode#anchors.
3+
//!
4+
//! Scans for the `<script type="text/required-ids">` element, which lists
5+
//! (whitespace-separated) IDs that must appear somewhere in the document.
6+
//! After verifying that all listed IDs are present, removes the script element.
7+
8+
use crate::dom_utils::NodeHandleExt;
9+
use html5ever::{QualName, local_name, ns};
10+
use markup5ever_rcdom::Handle;
11+
use std::collections::HashSet;
12+
13+
pub struct Processor {
14+
required_ids: HashSet<String>,
15+
script_node: Option<Handle>,
16+
}
17+
18+
impl Processor {
19+
pub fn new() -> Self {
20+
Self {
21+
required_ids: HashSet::new(),
22+
script_node: None,
23+
}
24+
}
25+
26+
pub fn visit(&mut self, node: &Handle) {
27+
// Capture and parse the <script type="text/required-ids"> element exactly once.
28+
if node.is_html_element(&local_name!("script")) {
29+
const TYPE: QualName = QualName {
30+
prefix: None,
31+
ns: ns!(),
32+
local: local_name!("type"),
33+
};
34+
if node.get_attribute(&TYPE).as_deref() == Some("text/required-ids") {
35+
assert!(
36+
self.script_node.is_none(),
37+
"multiple required-ids scripts encountered"
38+
);
39+
self.script_node = Some(node.clone());
40+
// Gather all text within the script and split on any ASCII whitespace.
41+
let content = node.text_content();
42+
for id_token in content.split_ascii_whitespace() {
43+
if !id_token.is_empty() {
44+
self.required_ids.insert(id_token.to_string());
45+
}
46+
}
47+
}
48+
}
49+
50+
// For elements with an id attribute, mark the ID as seen.
51+
if self.required_ids.is_empty() {
52+
return;
53+
}
54+
const ID_QN: QualName = QualName {
55+
prefix: None,
56+
ns: ns!(),
57+
local: local_name!("id"),
58+
};
59+
if let Some(id) = node.get_attribute(&ID_QN) {
60+
self.required_ids.remove(id.as_ref());
61+
}
62+
}
63+
64+
pub fn apply(self) -> std::io::Result<()> {
65+
if !self.required_ids.is_empty() {
66+
let mut missing: Vec<_> = self.required_ids.into_iter().collect();
67+
missing.sort();
68+
return Err(std::io::Error::new(
69+
std::io::ErrorKind::InvalidData,
70+
format!(
71+
"Missing required IDs for anchor permanence: {}",
72+
missing.join(", ")
73+
),
74+
));
75+
}
76+
77+
// Remove the script element (if present) after verification.
78+
if let Some(script) = self.script_node {
79+
script.remove();
80+
}
81+
Ok(())
82+
}
83+
}
84+
85+
// Unit tests for `Processor`. Each test parses a small HTML document, feeds
// every node to `Processor::visit` via `dom_utils::scan_dom`, and then checks
// the outcome of `Processor::apply` (or the panic raised during the scan).
#[cfg(test)]
86+
mod tests {
87+
use super::*;
88+
use crate::dom_utils;
89+
use crate::parser::{parse_document_async, tests::serialize_for_test};
90+
91+
// All three listed IDs (a, b, c) are present in the body, so `apply` succeeds
// and the serialized document must no longer mention "text/required-ids".
#[tokio::test]
92+
async fn removes_script_from_head() {
93+
let document = parse_document_async(r#"<!DOCTYPE html>
94+
<html><head><script type="text/required-ids">a b c</script></head><body><div id="a"></div><p id="b"></p><section id="c"></section></body></html>
95+
"#.as_bytes()).await.unwrap();
96+
let mut processor = Processor::new();
97+
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
98+
processor.apply().unwrap();
99+
let serialized = serialize_for_test(&[document]);
100+
assert!(!serialized.contains("text/required-ids"));
101+
}
102+
103+
// Without a required-ids script, `apply` succeeds and the serialized document
// is identical before and after processing.
#[tokio::test]
104+
async fn no_script_present_noop() {
105+
let document = parse_document_async(
106+
r#"<!DOCTYPE html>
107+
<html><head></head><body></body></html>
108+
"#
109+
.as_bytes(),
110+
)
111+
.await
112+
.unwrap();
113+
let before = serialize_for_test(&[document.clone()]);
114+
let mut processor = Processor::new();
115+
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
116+
processor.apply().unwrap();
117+
assert_eq!(before, serialize_for_test(&[document]));
118+
}
119+
120+
// The ID list is spread over several lines; all four IDs (foo, bar, baz, qux)
// must be recognized for `apply` to succeed.
#[tokio::test]
121+
async fn whitespace_splitting() {
122+
// Includes indentation, multiple spaces, and newlines in the script content.
123+
let document = parse_document_async(r#"<!DOCTYPE html><html><head><script type="text/required-ids">
124+
foo bar
125+
baz
126+
qux
127+
</script></head><body><div id="foo"></div><div id="bar"></div><div id="baz"></div><div id="qux"></div></body></html>
128+
"#.as_bytes()).await.unwrap();
129+
let mut processor = Processor::new();
130+
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
131+
processor.apply().unwrap();
132+
let serialized = serialize_for_test(&[document]);
133+
assert!(!serialized.contains("text/required-ids"));
134+
}
135+
136+
// Only "foo" exists in the document, so `apply` must fail and the error message
// must list the remaining IDs in sorted order ("bar, baz").
#[tokio::test]
137+
async fn errors_on_missing_ids() {
138+
let document = parse_document_async(r#"<!DOCTYPE html>
139+
<html><head><script type="text/required-ids">foo bar baz</script></head><body><div id="foo"></div></body></html>
140+
"#.as_bytes()).await.unwrap();
141+
let mut processor = Processor::new();
142+
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
143+
let err = processor.apply().expect_err("expected missing IDs error");
144+
assert!(
145+
err.to_string()
146+
.contains("Missing required IDs for anchor permanence: bar, baz")
147+
);
148+
}
149+
150+
// Two required-ids scripts in one document must trigger the assertion in
// `visit`; the panic happens during the scan, so `apply` is never called.
#[tokio::test]
151+
#[should_panic(expected = "multiple required-ids scripts encountered")]
152+
async fn panics_on_multiple_required_ids_scripts() {
153+
let document = parse_document_async(r#"<!DOCTYPE html><html><head>
154+
<script type="text/required-ids">a b</script>
155+
<script type="text/required-ids">c d</script>
156+
</head><body><div id="a"></div><div id="b"></div><div id="c"></div><div id="d"></div></body></html>"#.as_bytes()).await.unwrap();
157+
let mut processor = Processor::new();
158+
dom_utils::scan_dom(&document, &mut |h| processor.visit(h));
159+
}
160+
}

src/dom_utils.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ pub trait NodeHandleExt {
7676
where
7777
Self: Sized;
7878

79+
/// Removes the node from its parent.
80+
fn remove(&self)
81+
where
82+
Self: Sized;
83+
7984
/// Clones the node and its entire subtree (including template contents).
8085
fn deep_clone(&self) -> Self;
8186

@@ -326,6 +331,10 @@ impl NodeHandleExt for Handle {
326331
self.parent.take();
327332
}
328333

334+
fn remove(&self) {
335+
self.replace_with(Vec::new());
336+
}
337+
329338
fn deep_clone(&self) -> Handle {
330339
use NodeData::*;
331340
let new_node_data = match &self.data {

src/main.rs

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use std::path::{Path, PathBuf};
88

99
use markup5ever_rcdom::SerializableHandle;
1010

11+
mod anchor_permanence;
1112
mod annotate_attributes;
1213
mod boilerplate;
1314
mod dom_utils;
@@ -21,15 +22,25 @@ mod tag_omission;
2122

2223
#[tokio::main]
2324
async fn main() -> io::Result<()> {
25+
let is_post = env::args().any(|a| a == "--singlepage-post");
26+
let result = if is_post {
27+
// --singlepage-post runs the postprocess phase, which is currently only meant to be used on the
28+
// singlepage output from Wattsi.
29+
run_postprocess().await
30+
} else {
31+
// By default we run the preprocess phase, which creates a new input for Wattsi.
32+
run_preprocess().await
33+
};
34+
2435
// This gives slightly prettier error-printing.
25-
if let Err(e) = run().await {
36+
if let Err(e) = result {
2637
eprintln!("{e}");
2738
std::process::exit(1);
2839
}
2940
Ok(())
3041
}
3142

32-
async fn run() -> io::Result<()> {
43+
async fn run_preprocess() -> io::Result<()> {
3344
// Since we're using Rc in the DOM implementation, we must ensure that tasks
3445
// which act on it are confined to this thread.
3546

@@ -79,6 +90,27 @@ async fn run() -> io::Result<()> {
7990
Ok(())
8091
}
8192

93+
// The steps and considerations here are similar to run_preprocess.
94+
async fn run_postprocess() -> io::Result<()> {
95+
let document = parser::parse_document_async(tokio::io::stdin()).await?;
96+
97+
let mut anchor_permanence = anchor_permanence::Processor::new();
98+
99+
dom_utils::scan_dom(&document, &mut |h| {
100+
anchor_permanence.visit(h);
101+
});
102+
103+
anchor_permanence.apply()?;
104+
105+
let serializable: SerializableHandle = document.into();
106+
serialize(
107+
&mut BufWriter::with_capacity(128 * 1024, io::stdout()),
108+
&serializable,
109+
SerializeOpts::default(),
110+
)?;
111+
Ok(())
112+
}
113+
82114
fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path>
83115
where
84116
V: AsRef<OsStr> + ?Sized,

0 commit comments

Comments
 (0)