Skip to content

Commit fff1186

Browse files
committed
Implement a decoding tokenizer
Signed-off-by: Simon Wülker <[email protected]>
1 parent 31a2c31 commit fff1186

File tree

17 files changed

+453
-37
lines changed

17 files changed

+453
-37
lines changed

Diff for: html5ever/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ readme = "../README.md"
1313
rust-version.workspace = true
1414

1515
[features]
16+
default = ["encoding"]
1617
trace_tokenizer = []
18+
encoding = ["dep:encoding_rs", "markup5ever/encoding"]
1719

1820
[dependencies]
1921
log = "0.4"
2022
mac = "0.1"
2123
markup5ever = { version = "0.15", path = "../markup5ever" }
2224
match_token = { workspace = true }
25+
encoding_rs = { version = "0.8", optional = true }
2326

2427
[dev-dependencies]
2528
criterion = "0.5"

Diff for: html5ever/examples/noop-tokenize.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ extern crate html5ever;
1414
use std::cell::RefCell;
1515
use std::io;
1616

17+
use markup5ever::buffer_queue::BufferQueue;
1718
use html5ever::tendril::*;
18-
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
19+
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
1920

2021
/// In our case, our sink only contains a tokens vector
2122
struct Sink(RefCell<Vec<Token>>);

Diff for: html5ever/src/driver.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99

1010
//! High-level interface to the parser.
1111
12-
use crate::buffer_queue::BufferQueue;
12+
use markup5ever::buffer_queue::BufferQueue;
13+
1314
use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
1415
use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
1516
use crate::{Attribute, QualName};

Diff for: html5ever/src/tokenizer/char_ref/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
// except according to those terms.
99

1010
use super::{TokenSink, Tokenizer};
11-
use crate::buffer_queue::BufferQueue;
1211
use crate::data;
1312
use crate::tendril::StrTendril;
1413

1514
use log::debug;
1615
use mac::format_if;
16+
use markup5ever::buffer_queue::BufferQueue;
1717
use std::borrow::Cow::Borrowed;
1818
use std::char::from_u32;
1919

Diff for: html5ever/src/tokenizer/interface.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
7777
Script(Handle),
7878
Plaintext,
7979
RawData(states::RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
/// Types which can receive tokens from the tokenizer.

Diff for: html5ever/src/tokenizer/mod.rs

+39-8
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,16 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
2222
use self::char_ref::{CharRef, CharRefTokenizer};
2323

2424
use crate::util::str::lower_ascii_letter;
25-
2625
use log::{debug, trace};
2726
use mac::format_if;
28-
use markup5ever::{namespace_url, ns, small_char_set};
27+
use markup5ever::buffer_queue::BufferQueue;
28+
use markup5ever::{namespace_url, ns, small_char_set, InputSink, InputSinkResult};
2929
use std::borrow::Cow::{self, Borrowed};
3030
use std::cell::{Cell, RefCell, RefMut};
3131
use std::collections::BTreeMap;
32-
use std::mem;
32+
use std::{iter, mem};
3333

34-
pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34+
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
3535
use crate::tendril::StrTendril;
3636
use crate::{Attribute, LocalName, QualName, SmallCharSet};
3737

@@ -43,13 +43,17 @@ pub enum ProcessResult<Handle> {
4343
Continue,
4444
Suspend,
4545
Script(Handle),
46+
#[cfg(feature = "encoding")]
47+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding)
4648
}
4749

4850
/// Value handed back to the caller after input has been fed to the tokenizer.
#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
    /// The tokenizer returned without handing anything back to the caller.
    Done,
    /// The tokenizer stopped early and hands a script handle to the caller.
    Script(Handle),
    /// The sink requested that the input possibly be re-decoded with the
    /// given encoding and processing started over.
    ///
    /// Only available when the `encoding` feature is enabled.
    #[cfg(feature = "encoding")]
    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}
5458

5559
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -364,6 +368,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
364368
ProcessResult::Continue => (),
365369
ProcessResult::Suspend => break,
366370
ProcessResult::Script(node) => return TokenizerResult::Script(node),
371+
#[cfg(feature = "encoding")]
372+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding),
367373
}
368374
}
369375
} else {
@@ -372,6 +378,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
372378
ProcessResult::Continue => (),
373379
ProcessResult::Suspend => break,
374380
ProcessResult::Script(node) => return TokenizerResult::Script(node),
381+
#[cfg(feature = "encoding")]
382+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding),
375383
}
376384
}
377385
}
@@ -452,6 +460,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
452460
self.state.set(states::RawData(kind));
453461
ProcessResult::Continue
454462
},
463+
#[cfg(feature = "encoding")]
464+
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
455465
}
456466
}
457467

@@ -1455,6 +1465,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
14551465
ProcessResult::Continue => (),
14561466
ProcessResult::Suspend => break,
14571467
ProcessResult::Script(_) => unreachable!(),
1468+
#[cfg(feature = "encoding")]
1469+
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
14581470
}
14591471
}
14601472

@@ -1582,13 +1594,34 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
15821594
}
15831595
}
15841596

1597+
impl<Sink> InputSink for Tokenizer<Sink>
1598+
where Sink: TokenSink {
1599+
type Handle = Sink::Handle;
1600+
1601+
fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> {
1602+
iter::from_fn(|| {
1603+
self.feed(input).into()
1604+
})
1605+
}
1606+
}
1607+
1608+
impl<Handle> From<TokenizerResult<Handle>> for Option<InputSinkResult<Handle>> {
1609+
fn from(value: TokenizerResult<Handle>) -> Self {
1610+
match value {
1611+
TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)),
1612+
TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => Some(InputSinkResult::MaybeStartOverWithEncoding(encoding)),
1613+
TokenizerResult::Done => None,
1614+
}
1615+
}
1616+
}
1617+
15851618
#[cfg(test)]
15861619
#[allow(non_snake_case)]
15871620
mod test {
15881621
use super::option_push; // private items
1589-
use crate::tendril::{SliceExt, StrTendril};
1590-
15911622
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
1623+
use crate::tendril::{SliceExt, StrTendril};
1624+
use crate::LocalName;
15921625

15931626
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
15941627
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1597,8 +1630,6 @@ mod test {
15971630
use markup5ever::buffer_queue::BufferQueue;
15981631
use std::cell::RefCell;
15991632

1600-
use crate::LocalName;
1601-
16021633
// LinesMatch implements the TokenSink trait. It is used for testing to see
16031634
// if current_line is being updated when process_token is called. The lines
16041635
// vector is a collection of the line numbers that each token is on.

Diff for: html5ever/src/tree_builder/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,10 @@ where
394394
assert!(more_tokens.is_empty());
395395
return tokenizer::TokenSinkResult::RawData(k);
396396
},
397+
#[cfg(feature = "encoding")]
398+
MaybeChangeEncodingAndStartOver(encoding) => {
399+
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
400+
},
397401
}
398402
}
399403
}

Diff for: html5ever/src/tree_builder/rules.rs

+30-10
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,24 @@
1010
// The tree builder rules, as a single, enormous nested match expression.
1111

1212
use crate::interface::Quirks;
13-
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
13+
use crate::tokenizer::states::{Rawtext, Rcdata};
1414
use crate::tokenizer::TagKind::{EndTag, StartTag};
1515
use crate::tree_builder::tag_sets::*;
1616
use crate::tree_builder::types::*;
17-
use crate::tree_builder::{
18-
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
19-
TreeSink,
20-
};
21-
use crate::QualName;
22-
use markup5ever::{expanded_name, local_name, namespace_url, ns};
17+
use crate::tree_builder::RawKind::ScriptData;
18+
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
19+
20+
use markup5ever::interface::create_element;
21+
use markup5ever::interface::NodeOrText::AppendNode;
22+
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
2323
use std::borrow::Cow::Borrowed;
2424

2525
use crate::tendril::SliceExt;
2626
use match_token::match_token;
2727

28+
#[cfg(feature = "encoding")]
29+
use encoding_rs::Encoding;
30+
2831
fn any_not_whitespace(x: &StrTendril) -> bool {
2932
// FIXME: this might be much faster as a byte scan
3033
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -40,7 +43,11 @@ where
4043
Handle: Clone,
4144
Sink: TreeSink<Handle = Handle>,
4245
{
43-
pub(crate) fn step(&self, mode: InsertionMode, token: Token) -> ProcessResult<Handle> {
46+
pub(crate) fn step(
47+
&self,
48+
mode: InsertionMode,
49+
token: Token,
50+
) -> ProcessResult<Handle> {
4451
self.debug_step(mode, &token);
4552

4653
match mode {
@@ -113,8 +120,21 @@ where
113120

114121
<html> => self.step(InBody, token),
115122

116-
tag @ <base> <basefont> <bgsound> <link> <meta> => {
117-
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
123+
tag @ <meta> => {
    // FIXME: handle <meta http-equiv="Content-Type">

    // Resolve the declared encoding before `tag` is moved into the tree.
    // HTML attribute names live in the empty namespace (not the HTML
    // namespace), so the lookup has to compare against `("", "charset")`.
    #[cfg(feature = "encoding")]
    let declared_encoding = tag
        .attrs
        .iter()
        .find(|attr| attr.name.expanded() == expanded_name!("", "charset"))
        .and_then(|charset| Encoding::for_label(charset.value.as_bytes()));

    self.insert_and_pop_element_for(tag);

    // A recognised label is reported upwards; the element has already
    // been inserted at this point, exactly as on the regular path.
    #[cfg(feature = "encoding")]
    if let Some(encoding) = declared_encoding {
        return MaybeChangeEncodingAndStartOver(encoding);
    }

    DoneAckSelfClosing
},
136+
137+
tag @ <base> <basefont> <bgsound> <link> => {
118138
self.insert_and_pop_element_for(tag);
119139
DoneAckSelfClosing
120140
}

Diff for: html5ever/src/tree_builder/types.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub(crate) enum ProcessResult<Handle> {
7777
Script(Handle),
7878
ToPlaintext,
7979
ToRawData(RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
pub(crate) enum FormatEntry<Handle> {

Diff for: markup5ever/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@ rust-version.workspace = true
1414
[lib]
1515
path = "lib.rs"
1616

17+
[features]
18+
encoding = ["dep:encoding_rs"]
19+
1720
[dependencies]
1821
string_cache = "0.8"
1922
phf = "0.11"
2023
tendril = "0.4"
2124
log = "0.4"
25+
encoding_rs = { version = "0.8", optional = true }
2226

2327
[build-dependencies]
2428
string_cache_codegen = "0.5.4"

0 commit comments

Comments
 (0)