use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
use crate::common::{Position, TextPosition, XmlVersion};
use crate::name::OwnedName;
use crate::namespace::NamespaceStack;
use crate::reader::config::ParserConfig2;
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::indexset::AttributesSet;
use crate::reader::lexer::{Lexer, Token};
use super::{Error, ErrorKind};
use std::collections::HashMap;
use std::io::Read;
/// Generates `take_*` methods on `MarkupData`.
///
/// Every `field -> method, Type, default` entry expands to an inherent method
/// that stores `default` in `field` and returns the value that was there before.
macro_rules! gen_takes {
    ($($slot:ident -> $taker:ident, $ty:ty, $empty:expr);+) => {
        $(
            impl MarkupData {
                #[inline]
                #[allow(clippy::mem_replace_option_with_none)]
                #[allow(clippy::mem_replace_with_default)]
                fn $taker(&mut self) -> $ty {
                    std::mem::replace(&mut self.$slot, $empty)
                }
            }
        )+
    };
}
// Generate the `take_*` methods of `MarkupData`: each one swaps the named field
// with the listed default value and returns what was stored there before.
gen_takes!(
name -> take_name, String, String::new();
ref_data -> take_ref_data, String, String::new();
encoding -> take_encoding, Option<String>, None;
element_name -> take_element_name, Option<OwnedName>, None;
attr_name -> take_attr_name, Option<OwnedName>, None;
attributes -> take_attributes, AttributesSet, AttributesSet::new()
);
// Private submodules of the parser.
// NOTE(review): each presumably contains the token handler for the parser state
// of the same name (compare the method names used in `dispatch_token`) — confirm.
mod inside_cdata;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_processing_instruction;
mod inside_reference;
mod outside_tag;
// XML version reported by the `StartDocument` event built in `set_encountered`.
static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
// Standalone flag reported by the `StartDocument` event built in `set_encountered`.
static DEFAULT_STANDALONE: Option<bool> = None;
// Names of the elements that are currently open, innermost last.
type ElementStack = Vec<OwnedName>;
/// Outcome of one parsing step: an `XmlEvent` or the reader's error type.
pub type Result = super::Result<XmlEvent>;
/// Pull-based XML parser: turns lexer tokens into `XmlEvent`s, one per call to `next`.
pub(crate) struct PullParser {
    /// Parser configuration (limits, encoding override, end-of-stream handling).
    config: ParserConfig2,
    /// Source of tokens.
    lexer: Lexer,
    /// Current state; selects the handler in `dispatch_token`.
    st: State,
    /// State saved by `read_attribute_value` before switching to `State::InsideReference`.
    state_after_reference: State,
    /// Scratch buffer for the name or attribute value currently being read.
    buf: String,

    // NOTE(review): not used in this part of the file; presumably entity
    // definitions collected while reading the doctype — confirm.
    entities: HashMap<String, String>,

    /// Namespace scopes used to resolve element and attribute prefixes.
    nst: NamespaceStack,

    /// Pieces of the markup item currently being assembled.
    data: MarkupData,
    /// Terminal result; once set, `next` keeps returning a clone of it.
    final_result: Option<Result>,
    /// Event queued to be returned by the following call to `next`.
    next_event: Option<Result>,
    /// Names of the currently open elements.
    est: ElementStack,
    /// Queue of positions; the front entry is what `position()` reports.
    pos: Vec<TextPosition>,

    /// Most advanced construct seen so far (see `set_encountered`).
    encountered: Encountered,
    // NOTE(review): not used in this part of the file — meaning to be confirmed.
    inside_whitespace: bool,
    /// Set once the `:` separating prefix and local name has been read.
    read_prefix_separator: bool,
    /// When set, `next` pops one namespace scope before reading further tokens.
    pop_namespace: bool,
}
/// Most advanced kind of construct the parser has seen so far.
///
/// The derived ordering follows declaration order and is relied upon by
/// `set_encountered` and `handle_eof`, so the variants must stay in this order.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
    None = 0,
    AnyChars,
    Declaration,
    Comment,
    Doctype,
    Element,
}
impl PullParser {
/// Creates a parser from anything that converts into a `ParserConfig2`.
#[inline]
pub fn new(config: impl Into<ParserConfig2>) -> PullParser {
    Self::new_with_config2(config.into())
}
/// Builds a parser in its initial state (`State::DocumentStart`) for `config`.
#[inline]
fn new_with_config2(config: ParserConfig2) -> PullParser {
    let mut lexer = Lexer::new(&config);
    if let Some(forced) = config.override_encoding {
        lexer.set_encoding(forced);
    }

    // Capacity is reserved up front; `push_pos` treats it as the upper bound.
    let mut positions = Vec::with_capacity(16);
    positions.push(TextPosition::new());

    let markup = MarkupData {
        name: String::new(),
        version: None,
        encoding: None,
        standalone: None,
        ref_data: String::new(),
        element_name: None,
        quote: None,
        attr_name: None,
        attributes: AttributesSet::new(),
    };

    PullParser {
        config,
        lexer,
        st: State::DocumentStart,
        state_after_reference: State::OutsideTag,
        buf: String::new(),
        entities: HashMap::new(),
        nst: NamespaceStack::default(),
        data: markup,
        final_result: None,
        next_event: None,
        est: Vec::new(),
        pos: positions,
        encountered: Encountered::None,
        inside_whitespace: true,
        read_prefix_separator: false,
        pop_namespace: false,
    }
}
/// Returns the `ignore_end_of_stream` setting of the parser's configuration.
pub fn is_ignoring_end_of_stream(&self) -> bool {
    self.config.c.ignore_end_of_stream
}
/// Records that a construct of kind `new_encounter` has been reached.
///
/// Nothing happens unless `new_encounter` is strictly greater than the value
/// recorded so far. When the previous value was `Encountered::None`, a position
/// is pushed and a `StartDocument` event with the default version and standalone
/// flag and the lexer's current encoding is returned; otherwise `None`.
#[inline(never)]
fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> {
    if self.encountered >= new_encounter {
        return None;
    }

    let previous = std::mem::replace(&mut self.encountered, new_encounter);
    if previous != Encountered::None {
        return None;
    }

    // Nothing had been seen before, so the document start has to be reported now.
    self.push_pos();
    let start_document = XmlEvent::StartDocument {
        version: DEFAULT_VERSION,
        encoding: self.lexer.encoding().to_string(),
        standalone: DEFAULT_STANDALONE,
    };
    Some(Ok(start_document))
}
}
impl Position for PullParser {
    /// Returns the entry at the front of the parser's position queue.
    ///
    /// The queue is created with one entry and neither `next_pos` nor `push_pos`
    /// removes the last one, so the index is always valid.
    #[inline]
    fn position(&self) -> TextPosition {
        self.pos[0]
    }
}
/// Top-level parser state; `dispatch_token` picks the token handler from it.
#[derive(Copy, Clone, PartialEq)]
pub enum State {
    /// Handled by `outside_tag`.
    OutsideTag,
    /// Handled by `inside_opening_tag`.
    InsideOpeningTag(OpeningTagSubstate),
    /// Handled by `inside_closing_tag_name`.
    InsideClosingTag(ClosingTagSubstate),
    /// Handled by `inside_processing_instruction`.
    InsideProcessingInstruction(ProcessingInstructionSubstate),
    /// Handled by `inside_comment`.
    InsideComment,
    /// Handled by `inside_cdata`.
    InsideCData,
    /// Handled by `inside_declaration`.
    InsideDeclaration(DeclarationSubstate),
    /// Handled by `inside_doctype`.
    InsideDoctype(DoctypeSubstate),
    /// Handled by `inside_reference`.
    InsideReference,
    /// Initial state of a freshly created parser; handled by `document_start`.
    DocumentStart,
}
/// Sub-state carried by `State::InsideDoctype`.
#[derive(Copy, Clone, PartialEq)]
pub enum DoctypeSubstate {
    Outside,
    String,
    InsideName,
    BeforeEntityName,
    EntityName,
    BeforeEntityValue,
    EntityValue,
    NumericReferenceStart,
    NumericReference,
    PEReferenceInValue,
    PEReferenceInDtd,
    PEReferenceDefinitionStart,
    PEReferenceDefinition,
    SkipDeclaration,
    Comment,
}
/// Sub-state carried by `State::InsideOpeningTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum OpeningTagSubstate {
    InsideName,
    InsideTag,
    InsideAttributeName,
    AfterAttributeName,
    InsideAttributeValue,
    AfterAttributeValue,
}
/// Sub-state carried by `State::InsideClosingTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName,
}
/// Sub-state carried by `State::InsideProcessingInstruction`.
#[derive(Copy, Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData,
}
/// Sub-state carried by `State::InsideDeclaration`.
#[derive(Copy, Clone, PartialEq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    BeforeEncoding,
    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,
    AfterEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue,
}
/// Tells `read_qualified_name` what kind of name is being read, which in turn
/// decides which tokens are accepted as the end of the name.
#[derive(Copy, Clone, PartialEq)]
enum QualifiedNameTarget {
    AttributeNameTarget,
    OpeningTagNameTarget,
    ClosingTagNameTarget,
}
/// Kind of quote that opened the attribute value currently being read.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    SingleQuoteToken,
    DoubleQuoteToken,
}
impl QuoteToken {
    /// Converts a quote token into the matching `QuoteToken`.
    ///
    /// Any other token yields `None`; debug builds additionally trip an assertion,
    /// so callers are expected to pass quote tokens only.
    #[inline]
    fn from_token(t: Token) -> Option<QuoteToken> {
        let quote = match t {
            Token::SingleQuote => Self::SingleQuoteToken,
            Token::DoubleQuote => Self::DoubleQuoteToken,
            _ => {
                debug_assert!(false);
                return None;
            },
        };
        Some(quote)
    }

    /// Returns the lexer token corresponding to this quote kind.
    fn as_token(self) -> Token {
        match self {
            Self::SingleQuoteToken => Token::SingleQuote,
            Self::DoubleQuoteToken => Token::DoubleQuote,
        }
    }
}
/// Pieces of the markup item that is currently being parsed.
///
/// Several fields have `take_*` methods generated by `gen_takes!`.
struct MarkupData {
    /// Name buffer (has `take_name`).
    name: String,
    /// Reference text buffer (has `take_ref_data`).
    ref_data: String,

    /// XML version; `Some(Version11)` switches the character validity checks to XML 1.1.
    version: Option<XmlVersion>,
    /// Encoding value (has `take_encoding`).
    encoding: Option<String>,
    /// Standalone value.
    standalone: Option<bool>,

    /// Name of the element whose tag is being read.
    element_name: Option<OwnedName>,

    /// Quote that opened the attribute value being read; `None` outside a value.
    quote: Option<QuoteToken>,
    /// Name of the attribute being read.
    attr_name: Option<OwnedName>,
    /// Attributes collected for the current opening tag.
    attributes: AttributesSet,
}
impl PullParser {
pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
if let Some(ref ev) = self.final_result {
return ev.clone();
}
if let Some(ev) = self.next_event.take() {
return ev;
}
if self.pop_namespace {
self.pop_namespace = false;
self.nst.pop();
}
loop {
debug_assert!(self.next_event.is_none());
debug_assert!(!self.pop_namespace);
match self.lexer.next_token(r) {
Ok(Some(token)) => {
match self.dispatch_token(token) {
None => {}, Some(Ok(xml_event)) => {
self.next_pos();
return Ok(xml_event);
},
Some(Err(xml_error)) => {
self.next_pos();
return self.set_final_result(Err(xml_error));
},
}
},
Ok(None) => break,
Err(lexer_error) => {
return self.set_final_result(Err(lexer_error))
},
}
}
self.handle_eof()
}
fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> {
self.next_pos();
let ev = if self.depth() == 0 {
if self.encountered == Encountered::Element && self.st == State::OutsideTag { Ok(XmlEvent::EndDocument)
} else if self.encountered < Encountered::Element {
self.error(SyntaxError::NoRootElement)
} else { self.error(SyntaxError::UnexpectedEof) }
} else if self.config.c.ignore_end_of_stream {
self.final_result = None;
self.lexer.reset_eof_handled();
return self.error(SyntaxError::UnbalancedRootElement);
} else {
self.error(SyntaxError::UnbalancedRootElement)
};
self.set_final_result(ev)
}
/// Stores a copy of `result` as the terminal result and hands `result` back.
#[inline]
fn set_final_result(&mut self, result: Result) -> Result {
    let remembered = result.clone();
    self.final_result = Some(remembered);
    result
}
#[cold]
fn error(&self, e: SyntaxError) -> Result {
Err(Error {
pos: self.lexer.position(),
kind: ErrorKind::Syntax(e.to_cow()),
})
}
/// Advances the position queue by one event.
///
/// With several queued positions the front one is dropped; with exactly one, it
/// is overwritten with the lexer's current position; an empty queue is left alone.
#[inline]
fn next_pos(&mut self) {
    match self.pos.len() {
        0 => {},
        1 => self.pos[0] = self.lexer.position(),
        _ => {
            self.pos.remove(0);
        },
    }
}
/// Appends the lexer's current position to the position queue.
///
/// The queue never grows beyond its preallocated capacity: when it is full, debug
/// builds assert, and release builds drop the front entry instead of pushing.
#[inline]
#[track_caller]
fn push_pos(&mut self) {
    let is_full = self.pos.len() == self.pos.capacity();
    debug_assert!(!is_full, "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events.
This case is ignored in release mode, and merely causes document positions to be out of sync.
Please file a bug and include the XML document that triggers this assert.");

    if !is_full {
        self.pos.push(self.lexer.position());
    } else if self.pos.len() > 1 {
        self.pos.remove(0);
    }
}
/// Passes token `t` to the handler that belongs to the current state.
#[inline(never)]
fn dispatch_token(&mut self, t: Token) -> Option<Result> {
    match self.st {
        State::DocumentStart => self.document_start(t),
        State::OutsideTag => self.outside_tag(t),
        State::InsideOpeningTag(sub) => self.inside_opening_tag(t, sub),
        State::InsideClosingTag(sub) => self.inside_closing_tag_name(t, sub),
        State::InsideProcessingInstruction(sub) => self.inside_processing_instruction(t, sub),
        State::InsideDeclaration(sub) => self.inside_declaration(t, sub),
        State::InsideDoctype(sub) => self.inside_doctype(t, sub),
        State::InsideComment => self.inside_comment(t),
        State::InsideCData => self.inside_cdata(t),
        State::InsideReference => self.inside_reference(t),
    }
}
/// Number of elements that are currently open.
#[inline]
fn depth(&self) -> usize {
    let open_elements = &self.est;
    open_elements.len()
}
/// Returns `true` when the scratch buffer holds at least one character.
#[inline]
fn buf_has_data(&self) -> bool {
    let is_empty = self.buf.is_empty();
    !is_empty
}
/// Moves the contents out of the scratch buffer, leaving it empty.
#[inline]
fn take_buf(&mut self) -> String {
    let buffer = &mut self.buf;
    std::mem::take(buffer)
}
/// Switches to state `st` and passes `ev` through unchanged.
#[inline]
fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
    let passthrough = ev;
    self.st = st;
    passthrough
}
/// Switches to state `st` without producing an event.
#[inline]
fn into_state_continue(&mut self, st: State) -> Option<Result> {
    self.st = st;
    None
}
/// Switches to state `st` and produces `ev`.
#[inline]
fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
    self.st = st;
    Some(ev)
}
fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
if self.buf.len() <= 1 {
self.read_prefix_separator = false;
}
let invoke_callback = move |this: &mut PullParser, t| {
let name = this.take_buf();
match name.parse() {
Ok(name) => on_name(this, t, name),
Err(()) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
}
};
match t {
Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
self.buf.push(':');
self.read_prefix_separator = true;
None
},
Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c)) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
_ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
}
}
/// Feeds one token to the attribute-value reader.
///
/// Whitespace before the opening quote is skipped. The first quote token opens
/// the value; the same kind of quote closes it, at which point the buffered text
/// is handed to `on_value`. Inside the value a `ReferenceStart` switches to
/// `State::InsideReference` (remembering the current state), and every other
/// token is appended to the buffer as text.
///
/// # Parameters
/// * `t`        --- next token;
/// * `on_value` --- callback invoked with the value once the closing quote is read.
///
/// Returns `None` while the value is still being read, an error for `<`, for a
/// character rejected by `is_valid_xml_char_not_restricted`, for a value longer
/// than the configured limit or for a non-whitespace token before the opening
/// quote, and otherwise whatever `on_value` returns.
fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
where
    F: Fn(&mut PullParser, String) -> Option<Result>,
{
    match t {
        // Skip whitespace in front of the opening quote.
        Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None,

        Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
            // Opening quote: remember which kind it is.
            None => {
                self.data.quote = QuoteToken::from_token(t);
                None
            },
            // Matching closing quote: the value is complete.
            Some(q) if q.as_token() == t => {
                self.data.quote = None;
                let value = self.take_buf();
                on_value(self, value)
            },
            // The other kind of quote inside a quoted value is ordinary content.
            // `t` is known to be a quote token in this arm, so the character
            // validity check the original performed here could never run.
            Some(_) => {
                if self.buf.len() > self.config.max_attribute_length {
                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
                }
                t.push_to_string(&mut self.buf);
                None
            },
        },

        Token::ReferenceStart if self.data.quote.is_some() => {
            self.state_after_reference = self.st;
            self.into_state_continue(State::InsideReference)
        },

        Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)),

        Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
            Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
        },

        // Inside the quotes every remaining token is part of the value.
        _ if self.data.quote.is_some() => {
            if self.buf.len() > self.config.max_attribute_length {
                return Some(self.error(SyntaxError::ExceededConfiguredLimit));
            }
            t.push_to_string(&mut self.buf);
            None
        },

        _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
    }
}
/// Finishes an opening tag and produces its `StartElement` event.
///
/// The element name and every prefixed attribute name get their namespace
/// resolved through the namespace stack; an unbound prefix is an error. When
/// `emit_end_element` is `true` the matching `EndElement` is queued as the next
/// event and a namespace pop is scheduled; otherwise the name is pushed onto the
/// element stack. The parser moves to `State::OutsideTag`.
///
/// Returns `None` when no element name has been stored.
fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
    let mut name = self.data.take_element_name()?;
    let mut attributes = self.data.take_attributes().into_vec();

    // An empty namespace string means "no namespace".
    let element_ns = match self.nst.get(name.borrow().prefix_repr()) {
        Some("") => None,
        Some(ns) => Some(ns.into()),
        None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))),
    };
    name.namespace = element_ns;

    // Attributes without a prefix are left untouched.
    for attr in attributes.iter_mut() {
        if let Some(pfx) = &attr.name.prefix {
            let resolved = match self.nst.get(pfx) {
                Some("") => None,
                Some(ns) => Some(ns.into()),
                None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))),
            };
            attr.name.namespace = resolved;
        }
    }

    let name_copy = name.clone();
    if emit_end_element {
        self.pop_namespace = true;
        self.next_event = Some(Ok(XmlEvent::EndElement { name: name_copy }));
    } else {
        self.est.push(name_copy);
    }

    let namespace = self.nst.squash();
    let start = XmlEvent::StartElement { name, attributes, namespace };
    self.into_state_emit(State::OutsideTag, Ok(start))
}
fn emit_end_element(&mut self) -> Option<Result> {
let mut name = self.data.take_element_name()?;
match self.nst.get(name.borrow().prefix_repr()) {
Some("") => name.namespace = None, Some(ns) => name.namespace = Some(ns.into()),
None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))),
}
let op_name = self.est.pop()?;
if name == op_name {
self.pop_namespace = true;
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name }))
} else {
Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into())))
}
}
/// Checks `c` with `is_xml11_char` when the recorded version is XML 1.1, and with
/// `is_xml10_char` otherwise (including when no version has been recorded).
#[inline]
fn is_valid_xml_char(&self, c: char) -> bool {
    match self.data.version {
        Some(XmlVersion::Version11) => is_xml11_char(c),
        _ => is_xml10_char(c),
    }
}
/// Checks `c` with `is_xml11_char_not_restricted` when the recorded version is
/// XML 1.1, and with `is_xml10_char` otherwise (including when no version has
/// been recorded).
#[inline]
fn is_valid_xml_char_not_restricted(&self, c: char) -> bool {
    match self.data.version {
        Some(XmlVersion::Version11) => is_xml11_char_not_restricted(c),
        _ => is_xml10_char(c),
    }
}
}
#[cfg(test)]
mod tests {
use crate::attribute::OwnedAttribute;
use crate::common::TextPosition;
use crate::name::OwnedName;
use crate::reader::events::XmlEvent;
use crate::reader::parser::PullParser;
use crate::reader::ParserConfig;
use std::io::BufReader;
/// Creates a parser with the default `ParserConfig`.
fn new_parser() -> PullParser {
    let config = ParserConfig::new();
    PullParser::new(config)
}
/// Pulls the next result from parser `$parser` (reading from `$reader`) and panics
/// unless it matches the given pattern and, in the second form, also satisfies
/// the condition after `=>`.
macro_rules! expect_event {
    ($reader:expr, $parser:expr, $expected:pat) => {
        match $parser.next(&mut $reader) {
            $expected => {}
            e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($expected))
        }
    };
    ($reader:expr, $parser:expr, $expected:pat => $cond:expr) => {
        match $parser.next(&mut $reader) {
            $expected if $cond => {}
            e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($expected), stringify!($cond))
        }
    };
}
/// Expands to a `(reader, parser)` pair: a `BufReader` over the bytes of the
/// given string constant and a fresh parser from `new_parser()`.
macro_rules! test_data {
    ($doc:expr) => {{
        static DATA: &str = $doc;
        let reader = BufReader::new(DATA.as_bytes());
        let parser = new_parser();
        (reader, parser)
    }};
}
/// A semicolon inside an attribute value must be kept as plain text.
#[test]
fn issue_3_semicolon_in_attribute_value() {
    let (mut r, mut p) = test_data!(r#"
<a attr="zzz;zzz" />
"#);

    let tag = OwnedName::local("a");
    let expected_attr = OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz");

    expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
    expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
        *name == tag &&
        attributes.len() == 1 &&
        attributes[0] == expected_attr &&
        namespace.is_essentially_empty()
    );
    expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == tag);
    expect_event!(r, p, Ok(XmlEvent::EndDocument));
}
/// A numeric character reference in element content must be resolved to the
/// character it denotes.
#[test]
fn issue_140_entity_reference_inside_tag() {
    // The input has to contain the reference `&#9835;` (U+266B), not the literal
    // character: with the literal the reference handling is never exercised.
    let (mut r, mut p) = test_data!(r"
<bla>&#9835;</bla>
");

    expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
    expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
    expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
    expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
    expect_event!(r, p, Ok(XmlEvent::EndDocument));
}
/// Comment terminator handling: `<!--` inside a comment, a stray `-` before the
/// terminator, and comment text that looks like markup.
#[test]
fn issue_220_comment() {
    // `<!--` inside a comment is accepted.
    {
        let (mut r, mut p) = test_data!(r"<x><!-- <!--></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    // A comment ending in `--->` is rejected.
    {
        let (mut r, mut p) = test_data!(r"<x><!-- <!---></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Err(_));
    }

    // With comments reported, markup-like text is delivered verbatim.
    {
        let (mut r, mut p) = test_data!(r"<x><!--<text&x;> <!--></x>");
        p.config.c.ignore_comments = false;
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }
}
/// Every one of these malformed XML declarations must make the very first
/// parser result an error.
#[test]
fn malformed_declaration_attrs() {
    let malformed = [
        r#"<?xml version x="1.0"?>"#,
        r#"<?xml version="1.0" version="1.0"?>"#,
        r#"<?xml version="1.0"encoding="utf-8"?>"#,
        r#"<?xml version="1.0"standalone="yes"?>"#,
        r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#,
    ];

    for doc in malformed.iter() {
        let mut r = BufReader::new(doc.as_bytes());
        let mut p = new_parser();
        expect_event!(r, p, Err(_));
    }
}
/// A `<` inside an attribute value must be reported as `UnexpectedOpeningTag`
/// at the position of that `<`.
#[test]
fn opening_tag_in_attribute_value() {
    use crate::reader::error::{SyntaxError, Error, ErrorKind};

    // The input is spelled with explicit escapes so that the asserted position
    // does not depend on how this source file is indented: after the leading
    // newline come 12 spaces, then the 12 characters `<a attr="zzz`, which puts
    // the offending `<` at row 1, column 24 (both zero-based).
    let (mut r, mut p) = test_data!("\n            <a attr=\"zzz<zzz\" />\n        ");

    expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
    expect_event!(r, p, Err(ref e) =>
        *e == Error {
            kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()),
            pos: TextPosition { row: 1, column: 24 }
        }
    );
}
/// An `&` directly followed by another `&` in element content must produce an
/// error after the element has been opened.
#[test]
fn reference_err() {
    let (mut r, mut p) = test_data!(r"
<a>&&</a>
");

    expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
    expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
    expect_event!(r, p, Err(_));
}
/// Guards against the state enums growing: `State` must fit in two bytes and
/// `DoctypeSubstate` in one.
#[test]
fn state_size() {
    use std::mem::size_of;

    assert_eq!(2, size_of::<super::State>());
    assert_eq!(1, size_of::<super::DoctypeSubstate>());
}
}