1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
use libxml::xpath::Context;
use crate::data::{Corpus, Document};
use crate::util::data_helpers;
use crate::util::data_helpers::LexicalOptions;
/// Loads the document at `path` and returns its contents as normalized words.
///
/// The document is opened against a default [`Corpus`]. Every non-empty word
/// range is normalized with `data_helpers::ams_normalize_word_range` using the
/// default [`LexicalOptions`].
///
/// In the returned string each word is followed by a single space, and each
/// sentence that yielded at least one word is terminated by `'\n'`.
/// If normalization fails for any word, the entire enclosing paragraph is
/// omitted from the output (including sentences that were already collected).
///
/// # Panics
///
/// Panics if `Document::new` fails for `path`, or if an XPath [`Context`]
/// cannot be created for the document's DOM.
pub fn path_to_words(path: String) -> String {
    let corpus = Corpus::default();
    let mut document =
        Document::new(path, &corpus).expect("path_to_words: failed to load document");
    let mut context =
        Context::new(&document.dom).expect("path_to_words: failed to create XPath context");
    let mut document_buffer = String::new();

    for mut paragraph in document.paragraph_iter() {
        let mut invalid_paragraph = false;
        let mut paragraph_buffer = String::new();

        'sentences: for mut sentence in paragraph.iter() {
            let mut sentence_buffer = String::new();
            for word in sentence.simple_iter() {
                if word.range.is_empty() {
                    continue;
                }
                let word_string = match data_helpers::ams_normalize_word_range(
                    &word.range,
                    &mut context,
                    LexicalOptions::default(),
                ) {
                    Ok(w) => w,
                    Err(_) => {
                        // One bad word invalidates the whole paragraph, so stop
                        // processing its remaining sentences right away.
                        invalid_paragraph = true;
                        break 'sentences;
                    }
                };
                sentence_buffer.push_str(&word_string);
                sentence_buffer.push(' ');
            }
            // Sentences that produced no words do not emit a blank line.
            if !sentence_buffer.is_empty() {
                paragraph_buffer.push_str(&sentence_buffer);
                paragraph_buffer.push('\n');
            }
        }

        // Only fully-normalized paragraphs contribute to the output.
        if !invalid_paragraph {
            document_buffer.push_str(&paragraph_buffer);
        }
    }
    document_buffer
}