//! A `Corpus` of HTML5 documents rooted at a directory, walked in parallel
//! to build aggregate catalogs from per-document results.

use jwalk::WalkDir as ParWalkDir;
use rayon::iter::ParallelBridge;
use rayon::iter::ParallelIterator;
use std::collections::HashMap;

use super::document::Document;
use crate::dnm::DNMParameters;
use crate::tokenizer::Tokenizer;

use libxml::parser::Parser;

/// A parallel iterable Corpus of HTML5 documents
///
/// Bundles the root directory of the collection with the parsers, tokenizer and `DNM`
/// settings that are made available to every `Document` created from this corpus.
pub struct Corpus {
  /// root directory of the corpus; the directory walk starts here (defaults to `"."`)
  pub path: String,
  /// document XHTML5 parser (an XML `Parser` by default)
  pub xml_parser: Parser,
  /// document HTML5 parser (an HTML `Parser` by default)
  pub html_parser: Parser,
  /// `DNM`-aware sentence and word tokenizer
  pub tokenizer: Tokenizer,
  /// Default setting for `DNM` generation
  /// (the default corpus uses `DNMParameters::llamapun_normalization()`)
  pub dnm_parameters: DNMParameters,
  /// Extension of corpus files (for specially tailored resources such as DLMF's .html5)
  /// defaults to selecting .html AND .xhtml files
  ///
  /// When set, a file is selected if its file name ends with this string, so the value
  /// is matched as a plain suffix of the name.
  pub extension: Option<String>,
}

impl Default for Corpus {
  /// Build a corpus rooted at the current directory (`"."`), with default parsers and
  /// tokenizer, llamapun's normalization `DNM` parameters and no custom file extension.
  fn default() -> Self {
    // Components are constructed in a fixed order before being assembled.
    let path = String::from(".");
    let tokenizer = Tokenizer::default();
    let xml_parser = Parser::default();
    let html_parser = Parser::default_html();
    let dnm_parameters = DNMParameters::llamapun_normalization();

    Self {
      path,
      xml_parser,
      html_parser,
      tokenizer,
      dnm_parameters,
      // `None` means: select both .html and .xhtml files
      extension: None,
    }
  }
}

impl Corpus {
  /// Create a new parallel-processing corpus with the base directory `dirpath`
  ///
  /// Every other field takes its value from `Corpus::default()`.
  pub fn new(dirpath: String) -> Self {
    Corpus {
      path: dirpath,
      ..Corpus::default()
    }
  }

  /// Walk the corpus directory in parallel, apply `closure` to every selected document and
  /// merge the resulting per-document catalogs into one, summing the counts of equal keys.
  ///
  /// * Files are selected by name: if `self.extension` is set, the file name has to end with
  ///   it; otherwise names ending in `.html` or `.xhtml` are selected.
  /// * Hidden entries are skipped and the walk order is left unsorted.
  /// * Entries that cannot be read, or whose path is not valid UTF-8, are ignored.
  /// * A document that fails to load is reported on stderr and contributes nothing to the
  ///   catalog, so a single broken file does not abort the walk over the whole corpus.
  ///
  /// A progress line is printed to stdout for every 1000th selected path.
  ///
  /// Returns the merged catalog; it is empty if no document was selected.
  pub fn catalog_with_parallel_walk<F>(&self, closure: F) -> HashMap<String, u64>
  where F: Fn(Document) -> HashMap<String, u64> + Send + Sync {
    ParWalkDir::new(self.path.clone())
      .num_threads(rayon::current_num_threads())
      .skip_hidden(true)
      .sort(false)
      .into_iter()
      .filter_map(|each| {
        // unreadable directory entries are ignored
        let entry = each.ok()?;
        // a file name that is not valid UTF-8 also makes the full path non-UTF-8,
        // which is never selected, so bail out early
        let file_name = entry.file_name.to_str()?;
        let selected = match self.extension {
          Some(ref extension) => file_name.ends_with(extension),
          None => file_name.ends_with(".html") || file_name.ends_with(".xhtml"),
        };
        if !selected {
          return None;
        }
        let path = entry.path().to_str()?.to_owned();
        if path.is_empty() {
          None
        } else {
          Some(path)
        }
      })
      .enumerate()
      .par_bridge()
      .map(|(index, path)| {
        if index % 1000 == 0 && index > 0 {
          println!(
            "-- catalog_with_parallel_walk now processing document {:?}",
            1 + index
          );
        }
        // The path is cloned so that it is still available for the error report below.
        match Document::new(path.clone(), self) {
          Ok(document) => closure(document),
          Err(error) => {
            // Do not panic inside the worker: one unparseable file should not discard
            // the results gathered from the rest of the corpus.
            eprintln!(
              "-- catalog_with_parallel_walk skipping document {:?}: {:?}",
              path, error
            );
            HashMap::new()
          },
        }
      })
      .reduce(HashMap::new, |mut merged, mut other| {
        // Fold the smaller map into the larger one; the sums are the same either way,
        // but fewer entries have to be re-inserted.
        if merged.len() < other.len() {
          std::mem::swap(&mut merged, &mut other);
        }
        for (key, count) in other {
          *merged.entry(key).or_insert(0) += count;
        }
        merged
      })
  }
}