-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathstrtk_bloom_filter_example.cpp
More file actions
235 lines (187 loc) · 7.44 KB
/
strtk_bloom_filter_example.cpp
File metadata and controls
235 lines (187 loc) · 7.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
*****************************************************************
* String Toolkit Library *
* *
* Bloom Filter Example *
* Author: Arash Partow (2002-2020) *
* URL: http://www.partow.net/programming/strtk/index.html *
* *
* Copyright notice: *
* Free use of the String Toolkit Library is permitted under the *
* guidelines and in accordance with the most current version of *
* the MIT License. *
* http://www.opensource.org/licenses/MIT *
* *
*****************************************************************
*/
/*
Description: This example demonstrates the usage pattern for the bloom
filter data structure. The example goes through all 26
-choose-14 unique combinations of letters(a-z), adds each of
them as a string representation to the bloom filter in both
all upper and all lower case, then proceeds to test their
existence within the filter, noting any errors.
Once that is complete, another set of strings composed of
sub-strings of length 7 representing the first and second
halves of the original set of strings is then iterated
through, and subsequently their existence within the bloom
filter is tested, in the event a result of true is returned
the false positive count is incremented.
At the end of the process the bloom filter is persisted to
disk, read back from disk into a new bloom filter, then each
element from the original set is tested against the newly
constructed bloom filter, noting any errors. It should be
noted that the entropy of the persisted bloom filter is a
good indicator as to the overall performance of the filter.
*/
#include <cstddef>
#include <iostream>
#include <string>
#include "strtk.hpp"
int main()
{
static std::string letters = "abcdefghijklmnopqrstuvwxyz";
// n choose k
static const std::size_t n = letters.size();
static const std::size_t k = 14;
static const std::size_t element_count = 2 * static_cast<std::size_t>(strtk::n_choose_k(n,k));
static const double false_positive_probability = 0.0001;
typedef strtk::combination_iterator<char*> iterator_type;
strtk::bloom::parameters parameters;
parameters.projected_element_count = element_count;
parameters.false_positive_probability = false_positive_probability;
parameters.random_seed = strtk::magic_seed;
parameters.maximum_number_of_hashes = 7;
if (!parameters)
{
std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
return 1;
}
parameters.compute_optimal_parameters();
strtk::bloom::filter filter(parameters);
printf("Filter Size: %7.3fKB "
"Data Size: %8.3fKB "
"Hash Count: %llu\n",
filter.size() / (8.0 * strtk::one_kilobyte),
(1.0 * element_count * k) / strtk::one_kilobyte,
static_cast<unsigned long long>(filter.hash_count()));
static const std::size_t buffer_size = k * element_count;
unsigned char* buffer = new unsigned char[buffer_size];
{
iterator_type itr(k,letters);
const iterator_type end(letters);
unsigned char* buf_itr = buffer;
unsigned int real_elem = 0;
while (end != itr)
{
iterator_type::range_type range = *itr;
std::string s(range.first,range.second);
std::copy(s.data(),s.data() + s.size(),buf_itr);
buf_itr+= s.size();
strtk::convert_to_uppercase(s);
std::copy(s.data(),s.data() + s.size(),buf_itr);
buf_itr+= s.size();
real_elem += 2;
++itr;
}
}
{
unsigned char* buf_itr = buffer;
const unsigned char* buf_end = buffer + buffer_size;
strtk::util::timer timer;
timer.start();
while (buf_end != buf_itr)
{
filter.insert(buf_itr,k);
buf_itr += k;
}
timer.stop();
printf("[insert ] Element Count: %llu\tTotal Time: %5.3fsec\tRate: %10.3felem/sec\n",
static_cast<unsigned long long>(filter.element_count()),
timer.time(),
element_count / timer.time());
}
{
unsigned char* buf_itr = buffer;
const unsigned char* buf_end = buffer + buffer_size;
strtk::util::timer timer;
timer.start();
while (buf_end != buf_itr)
{
if (!filter.contains(buf_itr,k))
{
std::cout << "Error: Failed to find: " << std::string(buf_itr,buf_itr + k) << std::endl;
}
buf_itr += k;
}
timer.stop();
printf("[contain] Element Count: %llu\tTotal Time: %5.3fsec\tRate: %10.3felem/sec\n",
static_cast<unsigned long long>(filter.element_count()),
timer.time(),
element_count / timer.time());
}
{
static const std::size_t small_size = k / 2;
unsigned int false_positive_count = 0;
unsigned int small_element_count = 0;
unsigned char* buf_itr = buffer;
const unsigned char* buf_end = buffer + buffer_size;
strtk::util::timer timer;
timer.start();
while (buf_end != buf_itr)
{
if (filter.contains(buf_itr,small_size))
{
++false_positive_count;
}
++small_element_count;
buf_itr += small_size;
}
timer.stop();
printf("[FPC ] Element Count: %llu\tFalse Positive Count: %d\tFalse Positive Probability: %9.8f\tTotal Time: %5.3fsec\tRate: %10.3felem/sec\n",
static_cast<unsigned long long>(small_element_count),
false_positive_count,
(1.0 * false_positive_count)/small_element_count,
timer.time(),
small_element_count / timer.time());
}
{
strtk::bloom::filter secondary_filter;
if (!filter.write_to_file("bloom_filter.bin"))
{
std::cout << "Error - Failed to write filter to file!" << std::endl;
return 1;
}
else if (!secondary_filter.read_from_file("bloom_filter.bin"))
{
std::cout << "Error - Failed to read filter from file!" << std::endl;
return 1;
}
else if (secondary_filter != filter)
{
std::cout << "Error - Persisted filter and original filter do not match!" << std::endl;
return 1;
}
else
{
unsigned char* buf_itr = buffer;
const unsigned char* buf_end = buffer + buffer_size;
unsigned int failures = 0;
while (buf_end != buf_itr)
{
if (!secondary_filter.contains(buf_itr,k))
{
std::cout << "Error: Failed to find: " << std::string(buf_itr,buf_itr + k) << " in secondary filter!" << std::endl;
++failures;
}
buf_itr += k;
}
if (0 == failures)
{
std::cout << "Successfully replicated bloom filter." << std::endl;
}
}
}
delete [] buffer;
return 0;
}