1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use crate::dnm;
use crate::dnm::SpecialTagsOption;
use crate::parallel_data::*;
use libxml::xpath::Context;
use regex::Regex;
use std::collections::HashMap;
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;
use std::io::BufWriter;
use std::sync::{Arc, Mutex};
/// Capacity, in bytes, of the buffered writer that `extract` wraps around the
/// token model output file (10 MiB).
static BUFFER_CAPACITY: usize = 10 * 1024 * 1024;
/// Longest lowercased word, measured in bytes, that `extract` accepts; a longer
/// word causes its whole paragraph to be dropped from the output.
static MAX_WORD_LENGTH: usize = 25;
/// Extracts a plain-text token model from the corpus rooted at `corpus_path`
/// and writes it to the file at `token_model_filepath`.
///
/// Every document is visited through `Corpus::catalog_with_parallel_walk`. For
/// each paragraph, the lowercased words and punctuation are normalized and
/// appended to the output as one line (a line break, followed by the tokens,
/// each terminated by a single space):
///
/// * words containing `mathformula` are replaced by the result of
///   `dnm::node::lexematize_math`, or by an empty token when `discard_math`
///   is set;
/// * words containing `citationelement` are collapsed to `citationelement`;
/// * numeric literals are collapsed to `NUM`;
/// * everything else is written verbatim (lowercased).
///
/// A paragraph containing any word longer than `MAX_WORD_LENGTH` bytes is
/// dropped entirely and counted as an overflow.
///
/// When `discard_math` is `true`, the `math` tag and the `ltx_equation` /
/// `ltx_equationgroup` classes are additionally registered with
/// `SpecialTagsOption::Skip` in the corpus DNM parameters.
///
/// Returns the statistics map produced by the corpus walk. Each document
/// contributes the keys `document_count`, `paragraph_count`, `word_count`,
/// `overflow_count`, `formula_count`, `citation_count` and `num_count`
/// (presumably summed across documents by the walk -- confirm against
/// `catalog_with_parallel_walk`).
///
/// # Errors
///
/// Returns an error if the token model file cannot be created, or if the
/// final flush of the buffered writer fails (in which case the written data is
/// likely incomplete).
///
/// # Panics
///
/// Panics if an XPath context cannot be created for a document, if the writer
/// mutex is poisoned, or if writing a document's tokens to the shared writer
/// fails inside the walk closure.
pub fn extract(
    corpus_path: String,
    token_model_filepath: String,
    discard_math: bool,
) -> Result<HashMap<String, u64>, Box<dyn Error>> {
    let token_model_file = File::create(token_model_filepath)?;
    // The writer is shared with the walk closure, hence the `Arc<Mutex<_>>`.
    let token_writer = Arc::new(Mutex::new(BufWriter::with_capacity(
        BUFFER_CAPACITY,
        token_model_file,
    )));

    let space = ' ';
    let linebreak = '\n';
    // Optional minus sign and digits, optionally followed by either a single
    // letter in `a..=k` or a decimal fraction with an optional exponent.
    let is_numeric = Regex::new(r"^-?(?:\d+)(?:[a-k]|(?:\.\d+(?:[eE][+-]?\d+)?))?$").unwrap();

    let mut corpus = Corpus::new(corpus_path);
    if discard_math {
        println!("-- will discard math.");
        corpus
            .dnm_parameters
            .special_tag_name_options
            .insert("math".to_string(), SpecialTagsOption::Skip);
        corpus
            .dnm_parameters
            .special_tag_class_options
            .insert("ltx_equation".to_string(), SpecialTagsOption::Skip);
        corpus
            .dnm_parameters
            .special_tag_class_options
            .insert("ltx_equationgroup".to_string(), SpecialTagsOption::Skip);
    } else {
        println!("-- will lexematize math.")
    }

    let corpus_counts = corpus.catalog_with_parallel_walk(|document| {
        let (
            mut paragraph_count,
            mut word_count,
            mut overflow_count,
            mut formula_count,
            mut citation_count,
            mut num_count,
        ) = (0, 0, 0, 0, 0, 0);
        // Accumulate the whole document locally so the shared writer is locked
        // only once per document.
        let mut thread_buffer = String::new();
        let mut context = Context::new(&document.dom).unwrap();

        for mut paragraph in document.extended_paragraph_iter() {
            paragraph_count += 1;
            let mut paragraph_buffer = String::new();
            let mut invalid_paragraph = false;

            'words: for word in paragraph.word_and_punct_iter() {
                if word.range.is_empty() {
                    continue;
                }
                let word_string = word.range.get_plaintext().to_lowercase();
                // `len()` is the byte length of the lowercased word.
                if word_string.len() > MAX_WORD_LENGTH {
                    overflow_count += 1;
                    invalid_paragraph = true;
                    break 'words;
                }

                // Owned storage for the math lexeme; only initialized (and
                // borrowed) on the formula branch below.
                let lexeme: String;
                let token: &str = if word_string.contains("mathformula") {
                    formula_count += 1;
                    lexeme = if discard_math {
                        String::new()
                    } else {
                        dnm::node::lexematize_math(word.range.get_node(), &mut context)
                    };
                    &lexeme
                } else if word_string.contains("citationelement") {
                    citation_count += 1;
                    "citationelement"
                } else if is_numeric.is_match(&word_string) {
                    num_count += 1;
                    "NUM"
                } else {
                    word_count += 1;
                    &word_string
                };

                paragraph_buffer.push_str(token);
                paragraph_buffer.push(space);
            }

            // Paragraphs that hit an overlong word are left out of the output.
            if !invalid_paragraph {
                thread_buffer.push(linebreak);
                thread_buffer.push_str(&paragraph_buffer);
            }
        }

        token_writer
            .lock()
            .unwrap()
            .write_all(thread_buffer.as_bytes())
            .expect("thread writing to token model buffer should always succeed.");

        let mut thread_counts = HashMap::new();
        thread_counts.insert(String::from("document_count"), 1);
        thread_counts.insert(String::from("paragraph_count"), paragraph_count);
        thread_counts.insert(String::from("word_count"), word_count);
        thread_counts.insert(String::from("overflow_count"), overflow_count);
        thread_counts.insert(String::from("formula_count"), formula_count);
        thread_counts.insert(String::from("citation_count"), citation_count);
        thread_counts.insert(String::from("num_count"), num_count);
        thread_counts
    });

    // Flush explicitly and report failure to the caller: relying on `Drop`
    // would silently swallow the error and leave a truncated file behind.
    token_writer.lock().unwrap().flush()?;
    Ok(corpus_counts)
}