libxml/
parser.rs

1//! The parser functionality
2
3use crate::bindings::*;
4use crate::c_helpers::*;
5use crate::tree::*;
6
7use std::convert::AsRef;
8use std::error::Error;
9use std::ffi::c_void;
10use std::ffi::{CStr, CString};
11use std::fmt;
12use std::fs;
13use std::io;
14use std::os::raw::{c_char, c_int};
15use std::ptr;
16use std::slice;
17use std::str;
18use std::sync::Once;
19
/// One-time guard used by the `Parser` constructors in this file so that
/// libxml2's `xmlInitParser` is invoked at most once.
static INIT_LIBXML_PARSER: Once = Once::new();
21
/// Bit flags handed to libxml2 when parsing in XML mode.
///
/// Every discriminant is a distinct single bit, so the values can be combined
/// into one integer mask.
///
/// NOTE(review): these values mirror `HtmlParserOption`. In libxml2's
/// `xmlParserOption` enum the value `0x4` is documented as `XML_PARSE_DTDLOAD`
/// and `0x2000` as `XML_PARSE_NSCLEAN` — confirm that `Nodefdtd` and
/// `Noimplied` have the intended effect for XML input.
enum XmlParserOption {
  /// Recover on errors.
  Recover = 0x1,
  /// Named after "do not default a doctype" (see the review note above).
  Nodefdtd = 0x4,
  /// Suppress error reports.
  Noerror = 0x20,
  /// Suppress warning reports.
  Nowarning = 0x40,
  /// Pedantic error reporting.
  Pedantic = 0x80,
  /// Remove blank nodes.
  Noblanks = 0x100,
  /// Forbid network access.
  Nonet = 0x800,
  /// Named after "do not add implied elements" (see the review note above).
  Noimplied = 0x2000,
  /// Compact small text nodes.
  Compact = 0x1_0000,
  /// Relax any hardcoded limit from the parser.
  Huge = 0x8_0000,
  /// Ignore the internal document encoding hint.
  Ignoreenc = 0x20_0000,
}
35
/// Bit flags handed to libxml2 when parsing in HTML mode.
///
/// Every discriminant is a distinct single bit, so the values can be combined
/// into one integer mask. Variants are listed in ascending bit order.
enum HtmlParserOption {
  /// Relaxed parsing.
  Recover = 0x1,
  /// Do not default a doctype if not found.
  Nodefdtd = 0x4,
  /// Suppress error reports.
  Noerror = 0x20,
  /// Suppress warning reports.
  Nowarning = 0x40,
  /// Pedantic error reporting.
  Pedantic = 0x80,
  /// Remove blank nodes.
  Noblanks = 0x100,
  /// Forbid network access.
  Nonet = 0x800,
  /// Do not add implied html/body... elements.
  Noimplied = 0x2000,
  /// Compact small text nodes.
  Compact = 0x1_0000,
  /// Relax any hardcoded limit from the parser.
  Huge = 0x8_0000,
  /// Ignore the internal document encoding hint.
  Ignoreenc = 0x20_0000,
}
49
/// Parser Options
///
/// Each boolean field toggles one parser flag; `encoding` optionally names the
/// encoding to hand to the parser instead of letting it use its default.
#[derive(Debug, Clone)]
pub struct ParserOptions<'a> {
  /// Relaxed parsing
  pub recover: bool,
  /// do not default a doctype if not found
  pub no_def_dtd: bool,
  /// suppress error reports
  pub no_error: bool,
  /// suppress warning reports
  pub no_warning: bool,
  /// pedantic error reporting
  pub pedantic: bool,
  /// remove blank nodes
  pub no_blanks: bool,
  /// Forbid network access
  pub no_net: bool,
  /// Do not add implied html/body... elements
  pub no_implied: bool,
  /// relax any hardcoded limit from the parser
  pub huge: bool,
  /// compact small text nodes
  pub compact: bool,
  /// ignore internal document encoding hint
  pub ignore_enc: bool,
  /// manually-specified encoding
  pub encoding: Option<&'a str>,
}
77
78impl ParserOptions<'_> {
79  pub(crate) fn to_flags(&self, format: &ParseFormat) -> i32 {
80    macro_rules! to_option_flag {
81      (
82        $condition:expr => $variant:ident
83      ) => {
84        if $condition {
85          match format {
86            ParseFormat::HTML => HtmlParserOption::$variant as i32,
87            ParseFormat::XML => XmlParserOption::$variant as i32,
88          }
89        } else {
90          0
91        }
92      };
93    }
94    // return the combined flags
95    to_option_flag!(self.recover => Recover)
96      + to_option_flag!(self.no_def_dtd => Nodefdtd)
97      + to_option_flag!(self.no_error => Noerror)
98      + to_option_flag!(self.no_warning => Nowarning)
99      + to_option_flag!(self.pedantic => Pedantic)
100      + to_option_flag!(self.no_blanks => Noblanks)
101      + to_option_flag!(self.no_net => Nonet)
102      + to_option_flag!(self.no_implied => Noimplied)
103      + to_option_flag!(self.huge => Huge)
104      + to_option_flag!(self.compact => Compact)
105      + to_option_flag!(self.ignore_enc => Ignoreenc)
106  }
107}
108
109impl Default for ParserOptions<'_> {
110  fn default() -> Self {
111    ParserOptions {
112      recover: true,
113      no_def_dtd: false,
114      no_error: true,
115      no_warning: true,
116      pedantic: false,
117      no_blanks: false,
118      no_net: false,
119      no_implied: false,
120      huge: false,
121      compact: false,
122      ignore_enc: false,
123      encoding: None,
124    }
125  }
126}
127
/// Errors reported by the parsing entry points of this module.
pub enum XmlParseError {
  /// The underlying parse call returned a null document pointer.
  GotNullPointer,
  /// The file to parse could not be opened.
  FileOpenError,
  /// The input length does not fit in the 32-bit signed size libxml2 accepts.
  DocumentTooLarge,
}

impl fmt::Display for XmlParseError {
  /// Writes the fixed, human-readable message for the variant.
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    f.write_str(match self {
      XmlParseError::GotNullPointer => "Got a Null pointer",
      XmlParseError::FileOpenError => "Unable to open path to file.",
      XmlParseError::DocumentTooLarge => "Document too large for i32.",
    })
  }
}

impl fmt::Debug for XmlParseError {
  /// Debug output is intentionally the same text as `Display`.
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fmt::Display::fmt(self, f)
  }
}

impl Error for XmlParseError {}
159
/// Encoding pointer passed to the parser when the caller gave no encoding: null.
const DEFAULT_ENCODING: *const c_char = ptr::null::<c_char>();

/// URL pointer passed to the parser; this module never supplies one, so it is null.
const DEFAULT_URL: *const c_char = ptr::null::<c_char>();
165
/// Opens `filename` and returns the heap-allocated `fs::File` as an opaque
/// context pointer.
///
/// Ownership of the boxed file is released to the caller; `xml_close` turns the
/// pointer back into a `Box` and drops it.
fn xml_open(filename: &str) -> io::Result<*mut c_void> {
  fs::File::open(filename)
    .map(Box::new)
    .map(Box::into_raw)
    .map(|file| file.cast::<c_void>())
}
171
/// Read callback for an FS file.
///
/// Reads up to `len` bytes from the `fs::File` behind `context` into `buffer`.
/// Returns the number of bytes read (`0` at end of file), or `-1` when the read
/// fails or the arguments are unusable (null pointer or negative length).
///
/// # Safety
///
/// A non-null `context` must point to a live `fs::File` (as produced by
/// `xml_open`) that is not accessed elsewhere during the call, and a non-null
/// `buffer` must be valid for writes of `len` bytes.
unsafe extern "C" fn xml_read(context: *mut c_void, buffer: *mut c_char, len: c_int) -> c_int {
  if context.is_null() || buffer.is_null() {
    return -1;
  }
  // The length is expected to be positive, but a negative value cast to `usize`
  // would describe an enormous slice, so reject it instead of trusting the caller.
  if len < 0 {
    return -1;
  }
  let capacity = len as usize;
  let file = context as *mut fs::File;

  // SAFETY: `buffer` is non-null and the caller guarantees it is writable for
  // `len` (== `capacity`) bytes.
  let buf = unsafe { slice::from_raw_parts_mut(buffer as *mut u8, capacity) };
  // SAFETY: `context` is non-null and the caller guarantees it points to a live,
  // exclusively accessed `fs::File`.
  let file = unsafe { &mut *file };

  match io::Read::read(file, buf) {
    // `read` never returns more than `buf.len()`, which came from a `c_int`.
    Ok(read) => read as c_int,
    Err(_) => -1,
  }
}

/// Signature of the read callback accepted by the libxml2 IO-based readers.
type XmlReadCallback = unsafe extern "C" fn(*mut c_void, *mut c_char, c_int) -> c_int;
184
/// Close callback for an FS file.
///
/// Reclaims the boxed `fs::File` behind `context` and drops it, which closes
/// the file. Always returns `0`.
///
/// # Safety
///
/// `context` must be a pointer obtained from `Box::into_raw` on a
/// `Box<fs::File>` (as done by `xml_open`) and must not be used afterwards.
unsafe extern "C" fn xml_close(context: *mut c_void) -> c_int {
  // SAFETY: per the contract above, `context` came from `Box::into_raw` on a
  // `Box<fs::File>` and ownership is handed back exactly once.
  drop(unsafe { Box::from_raw(context.cast::<fs::File>()) });
  0
}

/// Signature of the close callback accepted by the libxml2 IO-based readers.
type XmlCloseCallback = unsafe extern "C" fn(*mut c_void) -> c_int;
194
195///Convert usize to i32 safely.
196fn try_usize_to_i32(value: usize) -> Result<i32, XmlParseError> {
197  if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
198    // Cannot safely use our value comparison, but the conversion if always safe.
199    // Or, if the value can be safely represented as a 32-bit signed integer.
200    Ok(value as i32)
201  } else {
202    // Document too large, cannot parse using libxml2.
203    Err(XmlParseError::DocumentTooLarge)
204  }
205}
206
/// The document formats a `Parser` can be configured for.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseFormat {
  /// Strict parsing for XML
  XML,
  /// Relaxed parsing for HTML
  HTML,
}
215/// Parsing API wrapper for libxml2
216pub struct Parser {
217  /// The `ParseFormat` for this parser
218  pub format: ParseFormat,
219}
220impl Default for Parser {
221  /// Create a parser for XML documents
222  fn default() -> Self {
223    // avoid deadlocks from using multiple parsers
224    INIT_LIBXML_PARSER.call_once(|| unsafe {
225      crate::bindings::xmlInitParser();
226    });
227    Parser {
228      format: ParseFormat::XML,
229    }
230  }
231}
232impl Parser {
233  /// Create a parser for HTML documents
234  pub fn default_html() -> Self {
235    // avoid deadlocks from using multiple parsers
236    INIT_LIBXML_PARSER.call_once(|| unsafe {
237      crate::bindings::xmlInitParser();
238    });
239    Parser {
240      format: ParseFormat::HTML,
241    }
242  }
243
244  /// Parses the XML/HTML file `filename` to generate a new `Document`
245  pub fn parse_file(&self, filename: &str) -> Result<Document, XmlParseError> {
246    self.parse_file_with_options(filename, ParserOptions::default())
247  }
248
249  /// Parses the XML/HTML file `filename` with a manually-specified parser-options
250  /// to generate a new `Document`
251  pub fn parse_file_with_options(
252    &self,
253    filename: &str,
254    parser_options: ParserOptions,
255  ) -> Result<Document, XmlParseError> {
256    // Create extern C callbacks for to read and close a Rust file through
257    // a void pointer.
258    let ioread: Option<XmlReadCallback> = Some(xml_read);
259    let ioclose: Option<XmlCloseCallback> = Some(xml_close);
260    let ioctx = match xml_open(filename) {
261      Ok(v) => v,
262      Err(_) => return Err(XmlParseError::FileOpenError),
263    };
264
265    // Process encoding.
266    let encoding_cstring: Option<CString> =
267      parser_options.encoding.map(|v| CString::new(v).unwrap());
268    let encoding_ptr = match encoding_cstring {
269      Some(v) => v.as_ptr(),
270      None => DEFAULT_ENCODING,
271    };
272
273    // Process url.
274    let url_ptr = DEFAULT_URL;
275
276    unsafe {
277      xmlKeepBlanksDefault(1);
278    }
279
280    let options = parser_options.to_flags(&self.format);
281
282    match self.format {
283      ParseFormat::XML => unsafe {
284        let doc_ptr = xmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options);
285        if doc_ptr.is_null() {
286          Err(XmlParseError::GotNullPointer)
287        } else {
288          Ok(Document::new_ptr(doc_ptr))
289        }
290      },
291      ParseFormat::HTML => unsafe {
292        let doc_ptr = htmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options);
293        if doc_ptr.is_null() {
294          Err(XmlParseError::GotNullPointer)
295        } else {
296          Ok(Document::new_ptr(doc_ptr))
297        }
298      },
299    }
300  }
301
302  ///Parses the XML/HTML bytes `input` to generate a new `Document`
303  pub fn parse_string<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> Result<Document, XmlParseError> {
304    self.parse_string_with_options(input, ParserOptions::default())
305  }
306
307  ///Parses the XML/HTML bytes `input` with a manually-specified
308  ///parser-options to generate a new `Document`
309  pub fn parse_string_with_options<Bytes: AsRef<[u8]>>(
310    &self,
311    input: Bytes,
312    parser_options: ParserOptions,
313  ) -> Result<Document, XmlParseError> {
314    // Process input bytes.
315    let input_bytes = input.as_ref();
316    let input_ptr = input_bytes.as_ptr() as *const c_char;
317    let input_len = try_usize_to_i32(input_bytes.len())?;
318
319    // Process encoding.
320    let encoding_cstring: Option<CString> =
321      parser_options.encoding.map(|v| CString::new(v).unwrap());
322    let encoding_ptr = match encoding_cstring {
323      Some(v) => v.as_ptr(),
324      None => DEFAULT_ENCODING,
325    };
326
327    // Process url.
328    let url_ptr = DEFAULT_URL;
329
330    let options = parser_options.to_flags(&self.format);
331
332    match self.format {
333      ParseFormat::XML => unsafe {
334        let docptr = xmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options);
335        if docptr.is_null() {
336          Err(XmlParseError::GotNullPointer)
337        } else {
338          Ok(Document::new_ptr(docptr))
339        }
340      },
341      ParseFormat::HTML => unsafe {
342        let docptr = htmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options);
343        if docptr.is_null() {
344          Err(XmlParseError::GotNullPointer)
345        } else {
346          Ok(Document::new_ptr(docptr))
347        }
348      },
349    }
350  }
351
352  /// Checks a string for well-formedness.
353  pub fn is_well_formed_html<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> bool {
354    self.is_well_formed_html_with_encoding(input, None)
355  }
356
357  /// Checks a string for well-formedness with manually-specified encoding.
358  /// IMPORTANT: This function is currently implemented in a HACKY way, to ignore invalid errors for HTML5 elements (such as <math>)
359  ///            this means you should NEVER USE IT WHILE THREADING, it is CERTAIN TO BREAK
360  ///
361  /// Help is welcome in implementing it correctly.
362  pub fn is_well_formed_html_with_encoding<Bytes: AsRef<[u8]>>(
363    &self,
364    input: Bytes,
365    encoding: Option<&str>,
366  ) -> bool {
367    // Process input string.
368    let input_bytes = input.as_ref();
369    if input_bytes.is_empty() {
370      return false;
371    }
372    let input_ptr = input_bytes.as_ptr() as *const c_char;
373    let input_len = match try_usize_to_i32(input_bytes.len()) {
374      Ok(v) => v,
375      Err(_) => return false,
376    };
377
378    // Process encoding.
379    let encoding_cstring: Option<CString> = encoding.map(|v| CString::new(v).unwrap());
380    let encoding_ptr = match encoding_cstring {
381      Some(v) => v.as_ptr(),
382      None => DEFAULT_ENCODING,
383    };
384
385    // Process url.
386    let url_ptr = DEFAULT_URL;
387    // disable generic error lines from libxml2
388    match self.format {
389      ParseFormat::XML => false, // TODO: Add support for XML at some point
390      ParseFormat::HTML => unsafe {
391        let ctxt = htmlNewParserCtxt();
392        setWellFormednessHandler(ctxt);
393        let docptr = htmlCtxtReadMemory(ctxt, input_ptr, input_len, url_ptr, encoding_ptr, 10_596); // htmlParserOption = 4+32+64+256+2048+8192
394        let well_formed_final = if htmlWellFormed(ctxt) {
395          // Basic well-formedness passes, let's check if we have an <html> element as root too
396          if !docptr.is_null() {
397            let node_ptr = xmlDocGetRootElement(docptr);
398            if node_ptr.is_null() {
399              return false
400            }
401            let name_ptr = xmlNodeGetName(node_ptr);
402            if name_ptr.is_null() {
403              false
404            }
405            //empty string
406            else {
407              let c_root_name = CStr::from_ptr(name_ptr);
408              let root_name = str::from_utf8(c_root_name.to_bytes()).unwrap().to_owned();
409              root_name == "html"
410            }
411          } else {
412            false
413          }
414        } else {
415          false
416        };
417
418        if !ctxt.is_null() {
419          htmlFreeParserCtxt(ctxt);
420        }
421        if !docptr.is_null() {
422          xmlFreeDoc(docptr);
423        }
424        well_formed_final
425      },
426    }
427  }
428}