1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
use circular_queue::CircularQueue;
use std::collections::HashMap;
/// Maps each distinct word to a 1-based index handed out in first-insertion order.
#[derive(Debug, Default)]
pub struct Dictionary {
    /// Word -> index assigned when the word was first inserted.
    pub map: HashMap<String, usize>,
    /// Number of indices handed out so far (also the most recently assigned index).
    index: usize,
}

impl Dictionary {
    /// Creates an empty dictionary.
    pub fn new() -> Self {
        Self::default()
    }

    /// Registers `word`, giving it the next free index if it has not been seen before.
    ///
    /// Re-inserting a word that is already present keeps its existing index.
    pub fn insert(&mut self, word: String) {
        let candidate = self.index + 1;
        let stored = *self.map.entry(word).or_insert(candidate);
        // A stored value above the running counter means `candidate` was just
        // consumed, so the counter advances.
        if stored > self.index {
            self.index = candidate;
        }
    }

    /// Returns every `(word, index)` pair, ordered from the highest index to the lowest.
    pub fn sorted(&self) -> Vec<(&String, usize)> {
        let mut pairs: Vec<(&String, usize)> = Vec::with_capacity(self.map.len());
        for (word, position) in &self.map {
            pairs.push((word, *position));
        }
        pairs.sort_by_key(|pair| std::cmp::Reverse(pair.1));
        pairs
    }

    /// Returns the number of indices handed out so far.
    pub fn count(&self) -> usize {
        self.index
    }
}
/// Occurrence counts of word n-grams, optionally restricted to the
/// neighbourhood of an anchor word.
pub struct Ngrams {
    /// Word whose surroundings are counted. With `None`, `add_content` has no
    /// implemented mode.
    pub anchor: Option<String>,
    /// Number of words kept after each anchor occurrence; `window_size - 1`
    /// words are kept before it.
    pub window_size: usize,
    /// Number of words per n-gram.
    pub n: usize,
    /// N-gram (words joined by a single space) -> number of times recorded.
    pub counts: HashMap<String, usize>,
}

impl Default for Ngrams {
    /// Unanchored unigram counter with no recorded n-grams.
    fn default() -> Ngrams {
        Ngrams {
            anchor: None,
            window_size: 0,
            n: 1,
            counts: HashMap::new(),
        }
    }
}

/// Scanner state while walking content in `Ngrams::add_anchored_content`.
#[derive(Debug, Copy, Clone, PartialEq)]
enum AnchorSide {
    /// No anchor is pending; words are only remembered as potential left context.
    Left,
    /// An anchor was seen fewer than `window_size` words ago; words extend the
    /// pending region.
    Right,
}

// NOTE(review): nothing in this impl uses `circular_queue::CircularQueue` any
// more; the file-level import may now be unused and can be dropped if no other
// code in the file needs it.
impl Ngrams {
    /// Returns how often the n-gram `word` has been recorded, or 0 if never.
    pub fn get(&self, word: &str) -> usize {
        self.counts.get(word).copied().unwrap_or(0)
    }

    /// Increments the count of `phrase`, starting from 0 for a new phrase.
    pub fn insert(&mut self, phrase: String) {
        *self.counts.entry(phrase).or_insert(0) += 1;
    }

    /// Returns all `(phrase, count)` pairs ordered by descending count.
    ///
    /// Phrases with equal counts are ordered alphabetically so the result does
    /// not depend on `HashMap` iteration order.
    pub fn sorted(&self) -> Vec<(&String, usize)> {
        let mut ranked: Vec<(&String, usize)> = self
            .counts
            .iter()
            .map(|(phrase, count)| (phrase, *count))
            .collect();
        ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
        ranked
    }

    /// Returns the number of distinct n-grams recorded.
    pub fn distinct_count(&self) -> usize {
        self.counts.len()
    }

    /// Counts the n-grams of `content` according to the configured mode.
    ///
    /// # Panics
    ///
    /// Panics via `unimplemented!()` unless an anchor is set and
    /// `window_size > 0`; only anchored counting exists.
    pub fn add_content(&mut self, content: &str) {
        if self.anchor.is_some() && self.window_size > 0 {
            self.add_anchored_content(content)
        } else {
            unimplemented!();
        }
    }

    /// Records the n-grams found around every occurrence of the anchor word.
    ///
    /// `content` is split on ASCII whitespace and tokens whose first character
    /// is not alphanumeric are skipped (so an anchor starting with such a
    /// character can never match). For each anchor occurrence the region made
    /// of up to `window_size - 1` preceding words, the anchor itself and up to
    /// `window_size` following words is passed to [`Ngrams::record_words`].
    /// Regions of anchors that fall inside each other's right-hand window are
    /// merged into one continuous region, and words already recorded are never
    /// reused as left context. Words outside any region are not counted.
    ///
    /// Does nothing when no anchor is set or `window_size` is 0.
    pub fn add_anchored_content(&mut self, content: &str) {
        use std::collections::VecDeque;

        let window = self.window_size;
        if window == 0 {
            return;
        }

        // Last `window - 1` unrecorded words: the left context of a future anchor.
        let mut left_context: VecDeque<&str> = VecDeque::with_capacity(window - 1);
        // Words of the region currently being assembled around pending anchor(s).
        let mut region: Vec<&str> = Vec::new();
        let mut side = AnchorSide::Left;
        let mut words_since_anchor = 0;

        let tokens = content
            .split_ascii_whitespace()
            .filter(|w| w.chars().next().map_or(false, char::is_alphanumeric));

        for w in tokens {
            // The anchor is re-borrowed on every iteration so that `self` is
            // free for the mutable `record_words` call below.
            if self.anchor.as_deref() == Some(w) {
                if side == AnchorSide::Left {
                    region.extend(left_context.drain(..));
                    side = AnchorSide::Right;
                }
                // A repeated anchor inside the window extends the region
                // instead of discarding what was gathered so far.
                region.push(w);
                words_since_anchor = 0;
            } else if side == AnchorSide::Right {
                region.push(w);
                words_since_anchor += 1;
                if words_since_anchor == window {
                    // Right-hand window complete: flush the whole region,
                    // including the words that followed the anchor.
                    self.record_words(std::mem::take(&mut region));
                    side = AnchorSide::Left;
                }
            } else if window > 1 {
                if left_context.len() == window - 1 {
                    left_context.pop_front();
                }
                left_context.push_back(w);
            }
        }

        // Content ended inside a right-hand window: flush what was gathered.
        // Trailing words with no pending anchor are deliberately not recorded.
        if side == AnchorSide::Right {
            self.record_words(region);
        }
    }

    /// Counts every run of `n` consecutive words in `words` as one n-gram,
    /// keyed by the words joined with a single space.
    ///
    /// Records nothing when `n` is 0 or `words` holds fewer than `n` words.
    pub fn record_words(&mut self, words: Vec<&str>) {
        if self.n == 0 {
            // `slice::windows` panics on a zero size, and a 0-gram has no meaning.
            return;
        }
        for gram in words.windows(self.n) {
            self.insert(gram.join(" "));
        }
    }
}