1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
use jwalk::WalkDir as ParWalkDir;
use rayon::iter::ParallelBridge;
use rayon::iter::ParallelIterator;
use std::collections::HashMap;
use super::document::Document;
use crate::dnm::DNMParameters;
use crate::tokenizer::Tokenizer;
use libxml::parser::Parser;
/// A document corpus rooted at a directory, bundled with the parsing and
/// tokenization resources that accompany it.
///
/// A reference to the corpus is handed to `Document::new` for every file that
/// `catalog_with_parallel_walk` selects.
/// NOTE(review): the parsers, tokenizer and DNM parameters are presumably
/// consumed by `Document` -- confirm in the `document` module.
pub struct Corpus {
/// Root directory of the corpus; the directory walk starts here.
/// Defaults to `"."`.
pub path: String,
/// libxml parser, initialized with `Parser::default()` by `Corpus::default`.
pub xml_parser: Parser,
/// libxml parser, initialized with `Parser::default_html()` by `Corpus::default`.
pub html_parser: Parser,
/// Tokenizer, initialized with `Tokenizer::default()` by `Corpus::default`.
pub tokenizer: Tokenizer,
/// DNM parameters, initialized with `DNMParameters::llamapun_normalization()`
/// by `Corpus::default`.
pub dnm_parameters: DNMParameters,
/// File-name suffix used to select corpus files during the directory walk.
/// When `None` (the default), files ending in `.html` or `.xhtml` are selected.
pub extension: Option<String>,
}
impl Default for Corpus {
    /// Builds a corpus rooted at the current directory (`"."`) with no custom
    /// file extension, default XML and HTML parsers, a default tokenizer and
    /// the `llamapun_normalization` DNM parameters.
    fn default() -> Self {
        // The resources are created in the same order as before, one per
        // named local, so the struct literal below can use field shorthand.
        let path = String::from(".");
        let tokenizer = Tokenizer::default();
        let xml_parser = Parser::default();
        let html_parser = Parser::default_html();
        let dnm_parameters = DNMParameters::llamapun_normalization();

        Self {
            path,
            xml_parser,
            html_parser,
            tokenizer,
            dnm_parameters,
            extension: None,
        }
    }
}
impl Corpus {
    /// Creates a corpus rooted at `dirpath`; every other field takes its value
    /// from [`Corpus::default`].
    pub fn new(dirpath: String) -> Self {
        Corpus {
            path: dirpath,
            ..Corpus::default()
        }
    }

    /// Walks the corpus directory in parallel, applies `closure` to every
    /// selected document and sums the per-document catalogs into one map.
    ///
    /// # Selection
    ///
    /// A directory entry is selected by its file name: it must end with
    /// `self.extension` when that is set, and with `.html` or `.xhtml`
    /// otherwise. Hidden entries are skipped by the walker, and entries whose
    /// name or path is not valid UTF-8 are ignored, as are entries the walker
    /// reports as errors.
    ///
    /// # Returns
    ///
    /// A map in which each key holds the sum of the values that `closure`
    /// returned for that key across all processed documents. The map is empty
    /// when no document was selected.
    ///
    /// # Failures
    ///
    /// A document that cannot be loaded is reported on standard error and
    /// contributes nothing to the result, so a single bad file does not abort
    /// the traversal of the whole corpus.
    ///
    /// # Progress
    ///
    /// A progress line is printed to standard output for every 1000th
    /// selected file.
    pub fn catalog_with_parallel_walk<F>(&self, closure: F) -> HashMap<String, u64>
    where
        F: Fn(Document) -> HashMap<String, u64> + Send + Sync,
    {
        ParWalkDir::new(&self.path)
            .num_threads(rayon::current_num_threads())
            .skip_hidden(true)
            .sort(false)
            .into_iter()
            .filter_map(|each| {
                // Entries the walker could not read are dropped.
                let entry = each.ok()?;
                // A non-UTF-8 name becomes "", which matches no non-empty suffix.
                let file_name = entry.file_name.to_str().unwrap_or("");
                let selected = match self.extension {
                    Some(ref extension) => file_name.ends_with(extension),
                    None => file_name.ends_with(".html") || file_name.ends_with(".xhtml"),
                };
                if !selected {
                    return None;
                }
                let path = entry.path();
                path.to_str()
                    .filter(|utf8_path| !utf8_path.is_empty())
                    .map(str::to_owned)
            })
            // Number the selected files before handing them to rayon, so the
            // index counts selected files rather than raw directory entries.
            .enumerate()
            .par_bridge()
            .map(|(index, path)| {
                if index % 1000 == 0 && index > 0 {
                    println!(
                        "-- catalog_with_parallel_walk now processing document {:?}",
                        1 + index
                    );
                }
                // `Document::new` consumes the path, so a copy is kept for the
                // error report.
                match Document::new(path.clone(), self) {
                    Ok(document) => closure(document),
                    Err(error) => {
                        eprintln!(
                            "-- catalog_with_parallel_walk skipping document {}: {:?}",
                            path, error
                        );
                        HashMap::new()
                    }
                }
            })
            .reduce(HashMap::new, |mut merged, mut other| {
                // Addition is commutative, so always fold the smaller map into
                // the larger one to keep the number of insertions low.
                if merged.len() < other.len() {
                    std::mem::swap(&mut merged, &mut other);
                }
                for (key, count) in other {
                    *merged.entry(key).or_insert(0) += count;
                }
                merged
            })
    }
}