use crate::bindings::*;
use crate::c_helpers::*;
use crate::tree::*;
use std::convert::AsRef;
use std::error::Error;
use std::ffi::c_void;
use std::ffi::{CStr, CString};
use std::fmt;
use std::fs;
use std::io;
use std::os::raw::{c_char, c_int};
use std::ptr;
use std::slice;
use std::str;
/// Bit flags applied when a document is parsed in XML mode.
///
/// Each discriminant is a single bit; `ParserOptions::to_flags` casts the
/// selected variants to `i32` to build the option mask.
///
/// NOTE(review): the variant names mirror `HtmlParserOption`. In libxml2's
/// `xmlParserOption` the values 4 and 8192 appear to be `XML_PARSE_DTDLOAD`
/// and `XML_PARSE_NSCLEAN` rather than "no default DTD" / "no implied" —
/// confirm against the libxml2 headers.
enum XmlParserOption {
    Recover = 1 << 0,
    Nodefdtd = 1 << 2,
    Noerror = 1 << 5,
    Nowarning = 1 << 6,
    Pedantic = 1 << 7,
    Noblanks = 1 << 8,
    Nonet = 1 << 11,
    Noimplied = 1 << 13,
    Compact = 1 << 16,
    Ignoreenc = 1 << 21,
}
/// Bit flags applied when a document is parsed in HTML mode.
///
/// Each discriminant is a single bit; `ParserOptions::to_flags` casts the
/// selected variants to `i32` to build the option mask.
/// The names presumably correspond to libxml2's `HTML_PARSE_*` constants —
/// confirm against the libxml2 headers.
enum HtmlParserOption {
    Recover = 1 << 0,
    Nodefdtd = 1 << 2,
    Noerror = 1 << 5,
    Nowarning = 1 << 6,
    Pedantic = 1 << 7,
    Noblanks = 1 << 8,
    Nonet = 1 << 11,
    Noimplied = 1 << 13,
    Compact = 1 << 16,
    Ignoreenc = 1 << 21,
}
/// Caller-facing switches that control how a document is parsed.
///
/// Every boolean field selects the same-named variant of `XmlParserOption` or
/// `HtmlParserOption` (depending on the parse format) when the options are
/// converted to an integer mask by `to_flags`.
pub struct ParserOptions<'a> {
/// Selects the `Recover` parser flag.
pub recover: bool,
/// Selects the `Nodefdtd` parser flag.
pub no_def_dtd: bool,
/// Selects the `Noerror` parser flag.
pub no_error: bool,
/// Selects the `Nowarning` parser flag.
pub no_warning: bool,
/// Selects the `Pedantic` parser flag.
pub pedantic: bool,
/// Selects the `Noblanks` parser flag.
pub no_blanks: bool,
/// Selects the `Nonet` parser flag.
pub no_net: bool,
/// Selects the `Noimplied` parser flag.
pub no_implied: bool,
/// Selects the `Compact` parser flag.
pub compact: bool,
/// Selects the `Ignoreenc` parser flag.
pub ignore_enc: bool,
/// Optional encoding name handed to the parser as a C string; when `None`
/// a null encoding pointer is passed instead.
pub encoding: Option<&'a str>,
}
impl<'a> ParserOptions<'a> {
    /// Folds the enabled boolean options into the integer option mask that is
    /// passed to the parser.
    ///
    /// `format` decides whether the bit values are taken from
    /// `HtmlParserOption` or `XmlParserOption`. Options that are `false`
    /// contribute nothing. The `encoding` field is not part of the mask.
    pub(crate) fn to_flags(&self, format: &ParseFormat) -> i32 {
        // Yields the bit for `$variant` in the active format when `$condition`
        // holds, and 0 otherwise.
        macro_rules! to_option_flag {
            ($condition:expr => $variant:ident) => {
                if $condition {
                    match format {
                        ParseFormat::HTML => HtmlParserOption::$variant as i32,
                        ParseFormat::XML => XmlParserOption::$variant as i32,
                    }
                } else {
                    0
                }
            };
        }

        // Each option appears exactly once and the bits are OR-ed together.
        // Summing them is fragile: counting `Nowarning` (64) twice yields 128,
        // which is the `Pedantic` bit, silently changing parser behaviour.
        [
            to_option_flag!(self.recover => Recover),
            to_option_flag!(self.no_def_dtd => Nodefdtd),
            to_option_flag!(self.no_error => Noerror),
            to_option_flag!(self.no_warning => Nowarning),
            to_option_flag!(self.pedantic => Pedantic),
            to_option_flag!(self.no_blanks => Noblanks),
            to_option_flag!(self.no_net => Nonet),
            to_option_flag!(self.no_implied => Noimplied),
            to_option_flag!(self.compact => Compact),
            to_option_flag!(self.ignore_enc => Ignoreenc),
        ]
        .iter()
        .fold(0, |mask, bit| mask | bit)
    }
}
impl<'a> Default for ParserOptions<'a> {
fn default() -> Self {
ParserOptions {
recover: true,
no_def_dtd: false,
no_error: true,
no_warning: true,
pedantic: false,
no_blanks: false,
no_net: false,
no_implied: false,
compact: false,
ignore_enc: false,
encoding: None,
}
}
}
/// Failures that the parsing entry points in this file can report.
pub enum XmlParseError {
    /// The parser call returned a null document pointer.
    GotNullPointer,
    /// The file to parse could not be opened.
    FileOpenError,
    /// The input length could not be represented as an `i32`.
    DocumentTooLarge,
}

impl Error for XmlParseError {}

impl fmt::Debug for XmlParseError {
    /// Debug output is intentionally the same text as the `Display` output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}

impl fmt::Display for XmlParseError {
    /// Writes a fixed, human-readable message for each variant.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            XmlParseError::GotNullPointer => "Got a Null pointer",
            XmlParseError::FileOpenError => "Unable to open path to file.",
            XmlParseError::DocumentTooLarge => "Document too large for i32.",
        };
        f.write_str(message)
    }
}
/// Null encoding pointer, passed to the parser when the caller did not name an encoding.
const DEFAULT_ENCODING: *const c_char = ptr::null();
/// Null URL pointer, passed as the document URL by the parsing methods in this file.
const DEFAULT_URL: *const c_char = ptr::null();
/// Opens `filename` for reading and returns the heap-allocated `fs::File` as an
/// opaque context pointer.
///
/// Ownership of the allocation moves to the returned pointer; it has to be
/// reclaimed with `Box::from_raw` (which is what `xml_close` does).
///
/// # Errors
///
/// Propagates the `io::Error` from `fs::File::open`.
fn xml_open(filename: &str) -> io::Result<*mut c_void> {
    let file = fs::File::open(filename)?;
    let boxed = Box::new(file);
    Ok(Box::into_raw(boxed) as *mut c_void)
}
/// Read callback: fills `buffer` with up to `len` bytes from the `fs::File`
/// behind `context`.
///
/// Returns the number of bytes read, or `-1` if the read failed.
///
/// # Safety
///
/// `context` must point to a live `fs::File`, and `buffer` must be valid for
/// writes of `len` bytes.
unsafe extern "C" fn xml_read(context: *mut c_void, buffer: *mut c_char, len: c_int) -> c_int {
    let source = &mut *(context as *mut fs::File);
    let target = slice::from_raw_parts_mut(buffer as *mut u8, len as usize);
    io::Read::read(source, target)
        .map(|count| count as c_int)
        .unwrap_or(-1)
}
/// Function-pointer type matching the signature of `xml_read`.
type XmlReadCallback = unsafe extern "C" fn(*mut c_void, *mut c_char, c_int) -> c_int;
/// Close callback: reclaims and drops the boxed `fs::File` behind `context`,
/// then reports success with `0`.
///
/// # Safety
///
/// `context` must have been produced by `Box::into_raw` on a `Box<fs::File>`
/// (as `xml_open` does) and must not be used again afterwards.
unsafe extern "C" fn xml_close(context: *mut c_void) -> c_int {
    drop(Box::from_raw(context as *mut fs::File));
    0
}
/// Function-pointer type matching the signature of `xml_close`.
type XmlCloseCallback = unsafe extern "C" fn(*mut c_void) -> c_int;
/// Converts a length expressed as `usize` into an `i32`.
///
/// Every value up to and including `i32::MAX` is accepted. The checked
/// conversion is correct for any pointer width, so no target-specific special
/// case is required.
///
/// # Errors
///
/// Returns `XmlParseError::DocumentTooLarge` when `value` exceeds `i32::MAX`.
fn try_usize_to_i32(value: usize) -> Result<i32, XmlParseError> {
    // Fully qualified so the function does not depend on `TryFrom` being in
    // the prelude of the crate's edition.
    <i32 as std::convert::TryFrom<usize>>::try_from(value)
        .map_err(|_| XmlParseError::DocumentTooLarge)
}
/// Selects which family of parser functions a `Parser` uses.
#[derive(PartialEq)]
pub enum ParseFormat {
/// Parse through the `xmlRead*` functions.
XML,
/// Parse through the `htmlRead*` functions.
HTML,
}
/// Entry point for parsing documents from files or in-memory buffers.
pub struct Parser {
/// Format that decides whether the XML or the HTML parser functions are called.
pub format: ParseFormat,
}
impl Default for Parser {
    /// Builds a parser whose format is `ParseFormat::XML`.
    fn default() -> Self {
        let format = ParseFormat::XML;
        Self { format }
    }
}
impl Parser {
    /// Builds a parser whose format is `ParseFormat::HTML`.
    pub fn default_html() -> Self {
        Parser {
            format: ParseFormat::HTML,
        }
    }

    /// Converts an optional encoding name into an owned C string.
    ///
    /// # Panics
    ///
    /// Panics if the name contains an interior NUL byte.
    fn encoding_to_cstring(encoding: Option<&str>) -> Option<CString> {
        encoding.map(|name| {
            CString::new(name).expect("encoding name must not contain interior NUL bytes")
        })
    }

    /// Parses the file at `filename` with `ParserOptions::default()`.
    ///
    /// # Errors
    ///
    /// See [`Parser::parse_file_with_options`].
    pub fn parse_file(&self, filename: &str) -> Result<Document, XmlParseError> {
        self.parse_file_with_options(filename, ParserOptions::default())
    }

    /// Parses the file at `filename` using the supplied options.
    ///
    /// The file is opened on the Rust side and handed to the parser through
    /// the `xml_read` / `xml_close` callbacks.
    ///
    /// # Errors
    ///
    /// * `XmlParseError::FileOpenError` if the file cannot be opened.
    /// * `XmlParseError::GotNullPointer` if the parser returns a null document.
    ///
    /// # Panics
    ///
    /// Panics if `parser_options.encoding` contains an interior NUL byte.
    pub fn parse_file_with_options(
        &self,
        filename: &str,
        parser_options: ParserOptions,
    ) -> Result<Document, XmlParseError> {
        let ioread: Option<XmlReadCallback> = Some(xml_read);
        let ioclose: Option<XmlCloseCallback> = Some(xml_close);
        let ioctx = xml_open(filename).map_err(|_| XmlParseError::FileOpenError)?;

        // The CString must stay alive until the parser call has returned, so
        // it is only borrowed when taking the raw pointer. Moving it into a
        // `match` arm would drop it and leave the pointer dangling.
        let encoding_cstring = Self::encoding_to_cstring(parser_options.encoding);
        let encoding_ptr = encoding_cstring
            .as_ref()
            .map_or(DEFAULT_ENCODING, |name| name.as_ptr());
        let url_ptr = DEFAULT_URL;

        // NOTE(review): only the file-based path makes this call; the
        // in-memory path does not — confirm that the difference is intended.
        unsafe {
            xmlKeepBlanksDefault(1);
        }

        let options = parser_options.to_flags(&self.format);
        unsafe {
            let doc_ptr = match self.format {
                ParseFormat::XML => {
                    xmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options)
                }
                ParseFormat::HTML => {
                    htmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options)
                }
            };
            if doc_ptr.is_null() {
                Err(XmlParseError::GotNullPointer)
            } else {
                Ok(Document::new_ptr(doc_ptr))
            }
        }
    }

    /// Parses an in-memory document with `ParserOptions::default()`.
    ///
    /// # Errors
    ///
    /// See [`Parser::parse_string_with_options`].
    pub fn parse_string<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> Result<Document, XmlParseError> {
        self.parse_string_with_options(input, ParserOptions::default())
    }

    /// Parses an in-memory document using the supplied options.
    ///
    /// # Errors
    ///
    /// * `XmlParseError::DocumentTooLarge` if the input length does not fit
    ///   in an `i32`.
    /// * `XmlParseError::GotNullPointer` if the parser returns a null document.
    ///
    /// # Panics
    ///
    /// Panics if `parser_options.encoding` contains an interior NUL byte.
    pub fn parse_string_with_options<Bytes: AsRef<[u8]>>(
        &self,
        input: Bytes,
        parser_options: ParserOptions,
    ) -> Result<Document, XmlParseError> {
        let input_bytes = input.as_ref();
        let input_ptr = input_bytes.as_ptr() as *const c_char;
        let input_len = try_usize_to_i32(input_bytes.len())?;

        // Borrow the CString so that it outlives the parser call below.
        let encoding_cstring = Self::encoding_to_cstring(parser_options.encoding);
        let encoding_ptr = encoding_cstring
            .as_ref()
            .map_or(DEFAULT_ENCODING, |name| name.as_ptr());
        let url_ptr = DEFAULT_URL;
        let options = parser_options.to_flags(&self.format);

        unsafe {
            let doc_ptr = match self.format {
                ParseFormat::XML => {
                    xmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options)
                }
                ParseFormat::HTML => {
                    htmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options)
                }
            };
            if doc_ptr.is_null() {
                Err(XmlParseError::GotNullPointer)
            } else {
                Ok(Document::new_ptr(doc_ptr))
            }
        }
    }

    /// Checks whether `input` is well-formed HTML, letting the parser pick
    /// the encoding.
    ///
    /// See [`Parser::is_well_formed_html_with_encoding`] for the exact rules.
    pub fn is_well_formed_html<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> bool {
        self.is_well_formed_html_with_encoding(input, None)
    }

    /// Checks whether `input` is well-formed HTML, optionally forcing an
    /// encoding.
    ///
    /// Returns `true` only when all of the following hold:
    ///
    /// * this parser's format is `ParseFormat::HTML`,
    /// * `input` is non-empty and its length fits in an `i32`,
    /// * `htmlWellFormed` reports success for the parser context,
    /// * a document was produced and its root element is named `html`.
    ///
    /// # Panics
    ///
    /// Panics if `encoding` contains an interior NUL byte (HTML format only).
    pub fn is_well_formed_html_with_encoding<Bytes: AsRef<[u8]>>(
        &self,
        input: Bytes,
        encoding: Option<&str>,
    ) -> bool {
        let input_bytes = input.as_ref();
        if input_bytes.is_empty() {
            return false;
        }
        let input_ptr = input_bytes.as_ptr() as *const c_char;
        let input_len = match try_usize_to_i32(input_bytes.len()) {
            Ok(len) => len,
            Err(_) => return false,
        };
        // The well-formedness check is only defined for the HTML format.
        match self.format {
            ParseFormat::XML => return false,
            ParseFormat::HTML => {}
        }

        // Borrow the CString so that it outlives the parser call below.
        let encoding_cstring = Self::encoding_to_cstring(encoding);
        let encoding_ptr = encoding_cstring
            .as_ref()
            .map_or(DEFAULT_ENCODING, |name| name.as_ptr());
        let url_ptr = DEFAULT_URL;

        unsafe {
            let ctxt = htmlNewParserCtxt();
            if ctxt.is_null() {
                // No context could be created; nothing to inspect or free.
                return false;
            }
            setWellFormednessHandler(ctxt);
            // 10_596 == 8192 | 2048 | 256 | 64 | 32 | 4, i.e. the Noimplied,
            // Nonet, Noblanks, Nowarning, Noerror and Nodefdtd values of
            // `HtmlParserOption`.
            let docptr = htmlCtxtReadMemory(ctxt, input_ptr, input_len, url_ptr, encoding_ptr, 10_596);

            let well_formed_final = htmlWellFormed(ctxt) && !docptr.is_null() && {
                let node_ptr = xmlDocGetRootElement(docptr);
                if node_ptr.is_null() {
                    // A document without a root element cannot be rooted at <html>.
                    false
                } else {
                    let name_ptr = xmlNodeGetName(node_ptr);
                    // Compare raw bytes: no allocation, and a non-UTF-8 name
                    // simply fails the comparison instead of panicking.
                    !name_ptr.is_null() && CStr::from_ptr(name_ptr).to_bytes() == b"html"
                }
            };

            htmlFreeParserCtxt(ctxt);
            if !docptr.is_null() {
                xmlFreeDoc(docptr);
            }
            well_formed_final
        }
    }
}