1use crate::bindings::*;
4use crate::c_helpers::*;
5use crate::tree::*;
6
7use std::convert::AsRef;
8use std::error::Error;
9use std::ffi::c_void;
10use std::ffi::{CStr, CString};
11use std::fmt;
12use std::fs;
13use std::io;
14use std::os::raw::{c_char, c_int};
15use std::ptr;
16use std::slice;
17use std::str;
18use std::sync::Once;
19
/// Guards the one-time call to `xmlInitParser`: the `Parser` constructors run
/// it through `call_once`, so only the first construction performs it.
static INIT_LIBXML_PARSER: Once = Once::new();
21
/// Option bits handed to the libxml2 reader when the input format is XML.
///
/// Every variant occupies exactly one bit, so any set of variants can be
/// combined into a single `i32` flag word. The variant names deliberately
/// match those of `HtmlParserOption` so a flag can be looked up by name in
/// either enum.
///
/// NOTE(review): libxml2's `xmlParserOption` table lists bit 2 (value 4) as
/// `XML_PARSE_DTDLOAD` and bit 13 (value 8192) as `XML_PARSE_NSCLEAN`, whereas
/// `HTML_PARSE_NODEFDTD` / `HTML_PARSE_NOIMPLIED` only exist for HTML. Confirm
/// that `Nodefdtd` and `Noimplied` have the intended effect on XML input.
enum XmlParserOption {
    Recover = 1 << 0,
    Nodefdtd = 1 << 2,
    Noerror = 1 << 5,
    Nowarning = 1 << 6,
    Pedantic = 1 << 7,
    Noblanks = 1 << 8,
    Nonet = 1 << 11,
    Noimplied = 1 << 13,
    Compact = 1 << 16,
    Huge = 1 << 19,
    Ignoreenc = 1 << 21,
}
35
/// Option bits handed to the libxml2 reader when the input format is HTML.
///
/// Every variant occupies exactly one bit, so any set of variants can be
/// combined into a single `i32` flag word. The variant names match those of
/// `XmlParserOption` so a flag can be looked up by name in either enum.
enum HtmlParserOption {
    Recover = 1 << 0,
    Nodefdtd = 1 << 2,
    Noerror = 1 << 5,
    Nowarning = 1 << 6,
    Pedantic = 1 << 7,
    Noblanks = 1 << 8,
    Nonet = 1 << 11,
    Noimplied = 1 << 13,
    Compact = 1 << 16,
    Huge = 1 << 19,
    Ignoreenc = 1 << 21,
}
49
/// Caller-facing switches that are translated into libxml2 option bits, plus
/// an optional input encoding.
///
/// Each boolean turns on the identically named `XmlParserOption` /
/// `HtmlParserOption` bit when the options are converted by `to_flags`.
/// The one-line meanings below paraphrase libxml2's parser option
/// documentation.
///
/// The parsing methods take this struct by value, so it is `Clone` to let one
/// configured value be reused for several parses, and `Debug` so it can be
/// logged.
#[derive(Debug, Clone)]
pub struct ParserOptions<'a> {
    /// Sets the `Recover` bit (libxml2: relaxed parsing / recover on errors).
    pub recover: bool,
    /// Sets the `Nodefdtd` bit (libxml2 HTML: do not default a doctype).
    pub no_def_dtd: bool,
    /// Sets the `Noerror` bit (libxml2: suppress error reports).
    pub no_error: bool,
    /// Sets the `Nowarning` bit (libxml2: suppress warning reports).
    pub no_warning: bool,
    /// Sets the `Pedantic` bit (libxml2: pedantic error reporting).
    pub pedantic: bool,
    /// Sets the `Noblanks` bit (libxml2: remove blank nodes).
    pub no_blanks: bool,
    /// Sets the `Nonet` bit (libxml2: forbid network access).
    pub no_net: bool,
    /// Sets the `Noimplied` bit (libxml2 HTML: do not add implied elements).
    pub no_implied: bool,
    /// Sets the `Huge` bit (libxml2: relax hard-coded parser limits).
    pub huge: bool,
    /// Sets the `Compact` bit (libxml2: compact small text nodes).
    pub compact: bool,
    /// Sets the `Ignoreenc` bit (libxml2: ignore the document's own encoding hint).
    pub ignore_enc: bool,
    /// Encoding name passed to libxml2 as a C string; `None` passes a null
    /// pointer instead.
    pub encoding: Option<&'a str>,
}
77
78impl ParserOptions<'_> {
79 pub(crate) fn to_flags(&self, format: &ParseFormat) -> i32 {
80 macro_rules! to_option_flag {
81 (
82 $condition:expr => $variant:ident
83 ) => {
84 if $condition {
85 match format {
86 ParseFormat::HTML => HtmlParserOption::$variant as i32,
87 ParseFormat::XML => XmlParserOption::$variant as i32,
88 }
89 } else {
90 0
91 }
92 };
93 }
94 to_option_flag!(self.recover => Recover)
96 + to_option_flag!(self.no_def_dtd => Nodefdtd)
97 + to_option_flag!(self.no_error => Noerror)
98 + to_option_flag!(self.no_warning => Nowarning)
99 + to_option_flag!(self.pedantic => Pedantic)
100 + to_option_flag!(self.no_blanks => Noblanks)
101 + to_option_flag!(self.no_net => Nonet)
102 + to_option_flag!(self.no_implied => Noimplied)
103 + to_option_flag!(self.huge => Huge)
104 + to_option_flag!(self.compact => Compact)
105 + to_option_flag!(self.ignore_enc => Ignoreenc)
106 }
107}
108
109impl Default for ParserOptions<'_> {
110 fn default() -> Self {
111 ParserOptions {
112 recover: true,
113 no_def_dtd: false,
114 no_error: true,
115 no_warning: true,
116 pedantic: false,
117 no_blanks: false,
118 no_net: false,
119 no_implied: false,
120 huge: false,
121 compact: false,
122 ignore_enc: false,
123 encoding: None,
124 }
125 }
126}
127
/// Failures reported by the `Parser` parsing methods.
pub enum XmlParseError {
    /// The libxml2 read function returned a null document pointer.
    GotNullPointer,
    /// The file to parse could not be opened.
    FileOpenError,
    /// The input length does not fit into the `i32` size argument taken by
    /// the in-memory read functions.
    DocumentTooLarge,
}
137
// Marker impl: uses the trait's default methods together with the `Debug` and
// `Display` impls in this file.
impl Error for XmlParseError {}
139
140impl fmt::Debug for XmlParseError {
141 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
142 write!(f, "{self}")
143 }
144}
145
146impl fmt::Display for XmlParseError {
147 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
148 write!(
149 f,
150 "{}",
151 match self {
152 XmlParseError::GotNullPointer => "Got a Null pointer",
153 XmlParseError::FileOpenError => "Unable to open path to file.",
154 XmlParseError::DocumentTooLarge => "Document too large for i32.",
155 }
156 )
157 }
158}
159
/// Null encoding pointer handed to the libxml2 read functions when the caller
/// did not name an encoding.
const DEFAULT_ENCODING: *const c_char = ptr::null::<c_char>();

/// Null URL pointer handed to the libxml2 read functions; no parsing method in
/// this file supplies a document URL.
const DEFAULT_URL: *const c_char = ptr::null::<c_char>();
165
/// Opens `filename` and returns the heap-allocated `File` as an opaque
/// context pointer.
///
/// The pointer owns the `File`: it has to be turned back into a
/// `Box<fs::File>` (as `xml_close` does) for the file to be closed and the
/// allocation released. Any error from `File::open` is returned unchanged.
fn xml_open(filename: &str) -> io::Result<*mut c_void> {
    fs::File::open(filename).map(|file| Box::into_raw(Box::new(file)) as *mut c_void)
}
171
/// Read callback for libxml2: copies up to `len` bytes from the `File` behind
/// `context` into `buffer`.
///
/// Returns the number of bytes read (0 at end of file), or -1 when the
/// arguments are unusable or the read fails. Reads that fail with
/// `ErrorKind::Interrupted` are retried rather than reported as errors.
///
/// # Safety
///
/// `context` must be null or a pointer obtained from `xml_open` that has not
/// yet been passed to `xml_close`. `buffer` must be null or valid for writes
/// of `len` bytes.
unsafe extern "C" fn xml_read(context: *mut c_void, buffer: *mut c_char, len: c_int) -> c_int {
    // Reject arguments that would otherwise lead to a null dereference or to a
    // slice whose length wrapped around from a negative `len`.
    if context.is_null() || buffer.is_null() || len < 0 {
        return -1;
    }

    let file = context as *mut fs::File;
    // `len` is non-negative here, so the cast to `usize` is lossless.
    let buf = slice::from_raw_parts_mut(buffer as *mut u8, len as usize);

    loop {
        match io::Read::read(&mut *file, buf) {
            // At most `len` bytes were read, so the count fits in `c_int`.
            Ok(count) => return count as c_int,
            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
            Err(_) => return -1,
        }
    }
}
182
/// Signature of the read callback passed to `xmlReadIO` / `htmlReadIO`;
/// `xml_read` has this type.
type XmlReadCallback = unsafe extern "C" fn(*mut c_void, *mut c_char, c_int) -> c_int;
184
/// Close callback for libxml2: takes back ownership of the boxed `File`
/// behind `context` and drops it, which closes the file. Always returns 0.
///
/// # Safety
///
/// `context` must be a pointer produced by `xml_open` that has not been
/// closed before; it must not be used again afterwards.
unsafe extern "C" fn xml_close(context: *mut c_void) -> c_int {
    // Rebuilding the box hands the allocation back to Rust; dropping it
    // closes the file.
    drop(Box::from_raw(context as *mut fs::File));
    0
}
192
/// Signature of the close callback passed to `xmlReadIO` / `htmlReadIO`;
/// `xml_close` has this type.
type XmlCloseCallback = unsafe extern "C" fn(*mut c_void) -> c_int;
194
195fn try_usize_to_i32(value: usize) -> Result<i32, XmlParseError> {
197 if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
198 Ok(value as i32)
201 } else {
202 Err(XmlParseError::DocumentTooLarge)
204 }
205}
206
/// Input syntax a `Parser` is configured for; it decides which family of
/// libxml2 read functions and which option enum are used.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseFormat {
    /// Read input with the XML functions (`xmlReadIO`, `xmlReadMemory`).
    XML,
    /// Read input with the HTML functions (`htmlReadIO`, `htmlReadMemory`).
    HTML,
}
/// Entry point for parsing files or in-memory buffers into a `Document`.
pub struct Parser {
    /// Whether input is read as XML or as HTML.
    pub format: ParseFormat,
}
220impl Default for Parser {
221 fn default() -> Self {
223 INIT_LIBXML_PARSER.call_once(|| unsafe {
225 crate::bindings::xmlInitParser();
226 });
227 Parser {
228 format: ParseFormat::XML,
229 }
230 }
231}
232impl Parser {
233 pub fn default_html() -> Self {
235 INIT_LIBXML_PARSER.call_once(|| unsafe {
237 crate::bindings::xmlInitParser();
238 });
239 Parser {
240 format: ParseFormat::HTML,
241 }
242 }
243
244 pub fn parse_file(&self, filename: &str) -> Result<Document, XmlParseError> {
246 self.parse_file_with_options(filename, ParserOptions::default())
247 }
248
249 pub fn parse_file_with_options(
252 &self,
253 filename: &str,
254 parser_options: ParserOptions,
255 ) -> Result<Document, XmlParseError> {
256 let ioread: Option<XmlReadCallback> = Some(xml_read);
259 let ioclose: Option<XmlCloseCallback> = Some(xml_close);
260 let ioctx = match xml_open(filename) {
261 Ok(v) => v,
262 Err(_) => return Err(XmlParseError::FileOpenError),
263 };
264
265 let encoding_cstring: Option<CString> =
267 parser_options.encoding.map(|v| CString::new(v).unwrap());
268 let encoding_ptr = match encoding_cstring {
269 Some(v) => v.as_ptr(),
270 None => DEFAULT_ENCODING,
271 };
272
273 let url_ptr = DEFAULT_URL;
275
276 unsafe {
277 xmlKeepBlanksDefault(1);
278 }
279
280 let options = parser_options.to_flags(&self.format);
281
282 match self.format {
283 ParseFormat::XML => unsafe {
284 let doc_ptr = xmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options);
285 if doc_ptr.is_null() {
286 Err(XmlParseError::GotNullPointer)
287 } else {
288 Ok(Document::new_ptr(doc_ptr))
289 }
290 },
291 ParseFormat::HTML => unsafe {
292 let doc_ptr = htmlReadIO(ioread, ioclose, ioctx, url_ptr, encoding_ptr, options);
293 if doc_ptr.is_null() {
294 Err(XmlParseError::GotNullPointer)
295 } else {
296 Ok(Document::new_ptr(doc_ptr))
297 }
298 },
299 }
300 }
301
302 pub fn parse_string<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> Result<Document, XmlParseError> {
304 self.parse_string_with_options(input, ParserOptions::default())
305 }
306
307 pub fn parse_string_with_options<Bytes: AsRef<[u8]>>(
310 &self,
311 input: Bytes,
312 parser_options: ParserOptions,
313 ) -> Result<Document, XmlParseError> {
314 let input_bytes = input.as_ref();
316 let input_ptr = input_bytes.as_ptr() as *const c_char;
317 let input_len = try_usize_to_i32(input_bytes.len())?;
318
319 let encoding_cstring: Option<CString> =
321 parser_options.encoding.map(|v| CString::new(v).unwrap());
322 let encoding_ptr = match encoding_cstring {
323 Some(v) => v.as_ptr(),
324 None => DEFAULT_ENCODING,
325 };
326
327 let url_ptr = DEFAULT_URL;
329
330 let options = parser_options.to_flags(&self.format);
331
332 match self.format {
333 ParseFormat::XML => unsafe {
334 let docptr = xmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options);
335 if docptr.is_null() {
336 Err(XmlParseError::GotNullPointer)
337 } else {
338 Ok(Document::new_ptr(docptr))
339 }
340 },
341 ParseFormat::HTML => unsafe {
342 let docptr = htmlReadMemory(input_ptr, input_len, url_ptr, encoding_ptr, options);
343 if docptr.is_null() {
344 Err(XmlParseError::GotNullPointer)
345 } else {
346 Ok(Document::new_ptr(docptr))
347 }
348 },
349 }
350 }
351
352 pub fn is_well_formed_html<Bytes: AsRef<[u8]>>(&self, input: Bytes) -> bool {
354 self.is_well_formed_html_with_encoding(input, None)
355 }
356
357 pub fn is_well_formed_html_with_encoding<Bytes: AsRef<[u8]>>(
363 &self,
364 input: Bytes,
365 encoding: Option<&str>,
366 ) -> bool {
367 let input_bytes = input.as_ref();
369 if input_bytes.is_empty() {
370 return false;
371 }
372 let input_ptr = input_bytes.as_ptr() as *const c_char;
373 let input_len = match try_usize_to_i32(input_bytes.len()) {
374 Ok(v) => v,
375 Err(_) => return false,
376 };
377
378 let encoding_cstring: Option<CString> = encoding.map(|v| CString::new(v).unwrap());
380 let encoding_ptr = match encoding_cstring {
381 Some(v) => v.as_ptr(),
382 None => DEFAULT_ENCODING,
383 };
384
385 let url_ptr = DEFAULT_URL;
387 match self.format {
389 ParseFormat::XML => false, ParseFormat::HTML => unsafe {
391 let ctxt = htmlNewParserCtxt();
392 setWellFormednessHandler(ctxt);
393 let docptr = htmlCtxtReadMemory(ctxt, input_ptr, input_len, url_ptr, encoding_ptr, 10_596); let well_formed_final = if htmlWellFormed(ctxt) {
395 if !docptr.is_null() {
397 let node_ptr = xmlDocGetRootElement(docptr);
398 if node_ptr.is_null() {
399 return false
400 }
401 let name_ptr = xmlNodeGetName(node_ptr);
402 if name_ptr.is_null() {
403 false
404 }
405 else {
407 let c_root_name = CStr::from_ptr(name_ptr);
408 let root_name = str::from_utf8(c_root_name.to_bytes()).unwrap().to_owned();
409 root_name == "html"
410 }
411 } else {
412 false
413 }
414 } else {
415 false
416 };
417
418 if !ctxt.is_null() {
419 htmlFreeParserCtxt(ctxt);
420 }
421 if !docptr.is_null() {
422 xmlFreeDoc(docptr);
423 }
424 well_formed_final
425 },
426 }
427 }
428}