176 lines
6.4 KiB
C++
176 lines
6.4 KiB
C++
|
/* #define NDEBUG */
|
||
|
|
||
|
#include "hash_table.hpp"
|
||
|
#include "hashing_algorithms.hpp"
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include <stdio.h>
|
||
|
#include <string.h>
|
||
|
#include <chrono>
|
||
|
#include <iostream>
|
||
|
#include <bitset>
|
||
|
#include <fstream>
|
||
|
#include <ctype.h>
|
||
|
#include <vector>
|
||
|
|
||
|
#include <assert.h>
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
|
||
|
// Longest word, in characters, that read_word() stores; longer words are cropped.
const int max_string_length = 256;

// D = ceil(max_string_length / 8). main() uses it as the number of uint64_t
// seed words, i.e. one 8-byte word per 8 characters of a maximum-length string.
const int D = (max_string_length + 7) / 8;
|
||
|
|
||
|
// Scans `book` for the next alphanumeric word, advancing *reading_progress, and
// returns it as a My_string allocated with new. Defined after main().
My_string * read_word(const char * book, size_t * reading_progress);
|
||
|
|
||
|
int main() {
|
||
|
const clock_t clock_before_seed = clock();
|
||
|
// load seed generated by random.org
|
||
|
string filename = "random_org_16_bit_numbers.txt";
|
||
|
std::ifstream seed_file(filename);
|
||
|
uint16_t seed_part;
|
||
|
uint16_t seed_parts[D << 2];
|
||
|
size_t count = 0;
|
||
|
while (seed_file >> seed_part) {
|
||
|
seed_parts[count] = seed_part;
|
||
|
count++;
|
||
|
if (count == D << 2)
|
||
|
break; // the seed is large enough for the max_string_length
|
||
|
}
|
||
|
if (count != D << 2) {
|
||
|
cout << "The current seed is not large enough. Please extend it by appending the numbers at https://www.random.org/integers/?num=10000&min=0&max=65535&col=1&base=10&format=plain&rnd=new." << endl;
|
||
|
return EXIT_SUCCESS;
|
||
|
}
|
||
|
// Get 64 bit seed from 16 bit seed
|
||
|
uint64_t seed[D];
|
||
|
memcpy(seed, seed_parts, D*sizeof(seed[0]));
|
||
|
|
||
|
const clock_t clock_before_hash_table = clock();
|
||
|
|
||
|
size_t l = 1; // initially, hash to a table of up to 1024 distinct words (2^10)
|
||
|
Hash_table ht = Hash_table(new Hash_function(seed, multiply_shift_string, l, D));
|
||
|
|
||
|
const clock_t clock_before_loading_book = clock();
|
||
|
|
||
|
// choose a book
|
||
|
/* std::ifstream ifs("genji_monogatari_english.txt"); */
|
||
|
/* std::ifstream ifs("Child_of_Light.txt"); */
|
||
|
/* std::ifstream ifs("the_adventures_of_sherlock_holmes.txt"); */
|
||
|
/* std::ifstream ifs("dracula.txt"); */
|
||
|
std::ifstream ifs("the_complete_works_of_william_shakespeare.txt");
|
||
|
|
||
|
string book_string( (std::istreambuf_iterator<char>(ifs) ),
|
||
|
(std::istreambuf_iterator<char>() ) );
|
||
|
const char * book = book_string.c_str();
|
||
|
const size_t book_length = book_string.size();
|
||
|
|
||
|
const clock_t clock_before_reading = clock();
|
||
|
|
||
|
std::vector<My_string *> words;
|
||
|
|
||
|
size_t reading_progress = 0;
|
||
|
while (reading_progress < book_length-1) { // book_length includes '\0' which reading_progress avoids
|
||
|
My_string * word = read_word(book, &reading_progress);
|
||
|
if (word->size > 0)
|
||
|
words.push_back(word);
|
||
|
}
|
||
|
|
||
|
const clock_t clock_after_reading = clock();
|
||
|
|
||
|
size_t count_words = 0;
|
||
|
for (My_string * word : words) {
|
||
|
count_words++;
|
||
|
ht.hash(word);
|
||
|
if (ht.is_time_for_rehash()) {
|
||
|
l++; // double the universe which we hashes to
|
||
|
ht.rehash(new Hash_function(seed, multiply_shift_string, l, D));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
const clock_t clock_after_hashing = clock();
|
||
|
|
||
|
Hash_function hf = Hash_function(seed, multiply_shift_string, 10, D);
|
||
|
|
||
|
const clock_t clock_after_init_hash_function = clock();
|
||
|
|
||
|
uint64_t hashed_value = 0;
|
||
|
|
||
|
for (My_string * word : words) {
|
||
|
/* hf.hash(word); */
|
||
|
hashed_value += hf.hash(word);
|
||
|
}
|
||
|
|
||
|
const clock_t clock_after_hashing_only = clock();
|
||
|
|
||
|
cout << "Sum of the hashed values (after overflow): " << hashed_value << endl;
|
||
|
|
||
|
cout << "Nr of words: " << count_words << endl;
|
||
|
cout << "Distinct words: " << ht.get_distict_words() << endl;
|
||
|
|
||
|
cout << "Time: Load seed: " << float( clock_before_hash_table - clock_before_seed ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Init hash table: " << float( clock_before_loading_book - clock_before_hash_table ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Load book: " << float( clock_before_reading - clock_before_loading_book ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Read: " << float( clock_after_reading - clock_before_reading ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Hash & Table: " << float( clock_after_hashing - clock_after_reading ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Total: " << float( clock_after_hashing - clock_before_seed ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Init hash function: " << float( clock_after_init_hash_function - clock_after_hashing ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Time: Hash, no table: " << float( clock_after_hashing_only - clock_after_init_hash_function ) / CLOCKS_PER_SEC << endl;
|
||
|
|
||
|
for (My_string * word : words)
|
||
|
delete word;
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
// BONUS: Polynomium hashing
|
||
|
uint32_t result_un;
|
||
|
size_t book_as_integer_length = (book_length+3)>>2; // ceil(book_length/4)
|
||
|
uint32_t book_as_integer[book_as_integer_length];
|
||
|
book_as_integer[book_as_integer_length-1] = 0;
|
||
|
memcpy(book_as_integer, book, book_length*sizeof(book[0]));
|
||
|
|
||
|
uint32_t a[3] = {12313,2212312,123332};
|
||
|
uint32_t b[3] = {5345,213213,123123};
|
||
|
uint32_t c[3] = {3231,213144,450022};
|
||
|
|
||
|
const clock_t begin_time_pol = clock();
|
||
|
result_un = polynomial_vector(book_as_integer, a, b, c, book_as_integer_length >> 2, 32);
|
||
|
const clock_t end_time_pol = clock();
|
||
|
cout << "polynomium hash function time: " << float( end_time_pol - begin_time_pol ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Result: " << result_un << endl;
|
||
|
|
||
|
const clock_t begin_time_pol_tun = clock();
|
||
|
result_un = polynomial_vector_tuned(book_as_integer, a, b, c, book_as_integer_length >> 2, 32, seed);
|
||
|
const clock_t end_time_pol_tun = clock();
|
||
|
cout << "tuned polynomium hash function time: " << float( end_time_pol_tun - begin_time_pol_tun ) / CLOCKS_PER_SEC << endl;
|
||
|
cout << "Result: " << result_un << endl;
|
||
|
|
||
|
return EXIT_SUCCESS;
|
||
|
}
|
||
|
|
||
|
/*
 * Scans `book` for the next word and returns it as a newly allocated My_string.
 *
 * A word is a maximal run of alphanumeric characters (as classified by
 * isalnum). Every stored character is lower-cased, including the first one, so
 * that e.g. "The" and "the" produce the same word. Words longer than
 * max_string_length are cropped: the surplus characters are consumed but not
 * stored.
 *
 * book              NUL-terminated text to scan.
 * reading_progress  In/out. On entry, the index of the last character that has
 *                   already been consumed; scanning starts at the index after
 *                   it, so the character at the incoming index itself is never
 *                   examined. On return, the index of the last character
 *                   consumed (the separator that ended the word, or the last
 *                   character before the terminating NUL).
 *
 * Returns a My_string created with `new`; the caller owns it and must delete
 * it. The word has length 0 when no alphanumeric character was found before
 * the end of the text.
 */
My_string * read_word(const char * book, size_t * reading_progress) {
    char word[max_string_length];
    size_t word_length = 0;

    while (book[*reading_progress + 1] != '\0') {
        ++(*reading_progress);
        // The <ctype.h> functions require a value representable as unsigned
        // char; a plain char can be negative for non-ASCII bytes.
        const unsigned char c = static_cast<unsigned char>(book[*reading_progress]);

        if (isalnum(c)) {
            // crop words longer than the max_string_length
            if (word_length < static_cast<size_t>(max_string_length)) {
                word[word_length] = static_cast<char>(tolower(c));
                ++word_length;
            }
        } else if (word_length > 0) {
            break; // first separator after the word: the word is complete
        }
        // otherwise: a separator before any word has started - keep skipping
    }

    return new My_string(word, word_length);
}
|