Bachelors_Thesis_Code/helper_code/fagprojekt_code/ex53.cpp
Knyffen 03ec008d0d A bit of cleanup
Move some helper files to other folders.

Actually remove the .sage files from the main directory, as I wrote I
would in the readme.
2021-11-14 14:48:58 +01:00

176 lines
6.4 KiB
C++

/* #define NDEBUG */
#include "hash_table.hpp"
#include "hashing_algorithms.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <chrono>
#include <iostream>
#include <bitset>
#include <fstream>
#include <ctype.h>
#include <vector>
#include <assert.h>
// File-wide using-directive; the unqualified `string`, `cout` and `endl` in main() rely on it.
using namespace std;
// Maximum number of characters kept per word; read_word() crops anything longer.
const int max_string_length = 256;
// D = ceil(max_string_length / 8). main() uses it as the element count of the
// uint64_t seed array and passes it to Hash_function -- presumably the number of
// 64-bit words needed to hold one maximum-length string (confirm in hashing_algorithms.hpp).
const int D = (max_string_length+7) >> 3;
// Extracts the next word from `book`, advancing *reading_progress; defined below main().
My_string * read_word(const char * book, size_t * reading_progress);
int main() {
const clock_t clock_before_seed = clock();
// load seed generated by random.org
string filename = "random_org_16_bit_numbers.txt";
std::ifstream seed_file(filename);
uint16_t seed_part;
uint16_t seed_parts[D << 2];
size_t count = 0;
while (seed_file >> seed_part) {
seed_parts[count] = seed_part;
count++;
if (count == D << 2)
break; // the seed is large enough for the max_string_length
}
if (count != D << 2) {
cout << "The current seed is not large enough. Please extend it by appending the numbers at https://www.random.org/integers/?num=10000&min=0&max=65535&col=1&base=10&format=plain&rnd=new." << endl;
return EXIT_SUCCESS;
}
// Get 64 bit seed from 16 bit seed
uint64_t seed[D];
memcpy(seed, seed_parts, D*sizeof(seed[0]));
const clock_t clock_before_hash_table = clock();
size_t l = 1; // initially, hash to a table of up to 1024 distinct words (2^10)
Hash_table ht = Hash_table(new Hash_function(seed, multiply_shift_string, l, D));
const clock_t clock_before_loading_book = clock();
// choose a book
/* std::ifstream ifs("genji_monogatari_english.txt"); */
/* std::ifstream ifs("Child_of_Light.txt"); */
/* std::ifstream ifs("the_adventures_of_sherlock_holmes.txt"); */
/* std::ifstream ifs("dracula.txt"); */
std::ifstream ifs("the_complete_works_of_william_shakespeare.txt");
string book_string( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
const char * book = book_string.c_str();
const size_t book_length = book_string.size();
const clock_t clock_before_reading = clock();
std::vector<My_string *> words;
size_t reading_progress = 0;
while (reading_progress < book_length-1) { // book_length includes '\0' which reading_progress avoids
My_string * word = read_word(book, &reading_progress);
if (word->size > 0)
words.push_back(word);
}
const clock_t clock_after_reading = clock();
size_t count_words = 0;
for (My_string * word : words) {
count_words++;
ht.hash(word);
if (ht.is_time_for_rehash()) {
l++; // double the universe which we hashes to
ht.rehash(new Hash_function(seed, multiply_shift_string, l, D));
}
}
const clock_t clock_after_hashing = clock();
Hash_function hf = Hash_function(seed, multiply_shift_string, 10, D);
const clock_t clock_after_init_hash_function = clock();
uint64_t hashed_value = 0;
for (My_string * word : words) {
/* hf.hash(word); */
hashed_value += hf.hash(word);
}
const clock_t clock_after_hashing_only = clock();
cout << "Sum of the hashed values (after overflow): " << hashed_value << endl;
cout << "Nr of words: " << count_words << endl;
cout << "Distinct words: " << ht.get_distict_words() << endl;
cout << "Time: Load seed: " << float( clock_before_hash_table - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash table: " << float( clock_before_loading_book - clock_before_hash_table ) / CLOCKS_PER_SEC << endl;
cout << "Time: Load book: " << float( clock_before_reading - clock_before_loading_book ) / CLOCKS_PER_SEC << endl;
cout << "Time: Read: " << float( clock_after_reading - clock_before_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash & Table: " << float( clock_after_hashing - clock_after_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Total: " << float( clock_after_hashing - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash function: " << float( clock_after_init_hash_function - clock_after_hashing ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash, no table: " << float( clock_after_hashing_only - clock_after_init_hash_function ) / CLOCKS_PER_SEC << endl;
for (My_string * word : words)
delete word;
// BONUS: Polynomium hashing
uint32_t result_un;
size_t book_as_integer_length = (book_length+3)>>2; // ceil(book_length/4)
uint32_t book_as_integer[book_as_integer_length];
book_as_integer[book_as_integer_length-1] = 0;
memcpy(book_as_integer, book, book_length*sizeof(book[0]));
uint32_t a[3] = {12313,2212312,123332};
uint32_t b[3] = {5345,213213,123123};
uint32_t c[3] = {3231,213144,450022};
const clock_t begin_time_pol = clock();
result_un = polynomial_vector(book_as_integer, a, b, c, book_as_integer_length >> 2, 32);
const clock_t end_time_pol = clock();
cout << "polynomium hash function time: " << float( end_time_pol - begin_time_pol ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
const clock_t begin_time_pol_tun = clock();
result_un = polynomial_vector_tuned(book_as_integer, a, b, c, book_as_integer_length >> 2, 32, seed);
const clock_t end_time_pol_tun = clock();
cout << "tuned polynomium hash function time: " << float( end_time_pol_tun - begin_time_pol_tun ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
return EXIT_SUCCESS;
}
// Extracts the next word from `book`.
//
// A word is a maximal run of alphanumeric characters; it is stored lower-cased
// and cropped to max_string_length characters.
//
// book              NUL-terminated text to scan.
// reading_progress  in/out: index of the last character already consumed.
//                   Scanning starts at the character after it, so with an
//                   initial value of 0 the character book[0] is never
//                   examined. On return it is the index of the delimiter that
//                   ended the word, or of the last character before the
//                   terminator when the text ran out.
//
// Returns a heap-allocated My_string that the caller must delete. Its length is
// 0 when no alphanumeric character was found before the end of the text.
My_string * read_word(const char * book, size_t * reading_progress) {
    char word[max_string_length];
    size_t word_length = 0;
    bool word_started = false;

    // Test book[*reading_progress] first so that an empty book (where index 0
    // is already the terminator) never reads one past the end.
    while (book[*reading_progress] != '\0' && book[*reading_progress + 1] != '\0') {
        (*reading_progress)++;
        // <ctype.h> functions require a value representable as unsigned char;
        // a plain (possibly negative) char is undefined behaviour for non-ASCII bytes.
        const unsigned char c = static_cast<unsigned char>(book[*reading_progress]);

        if (isalnum(c)) {
            word_started = true;
            if (word_length != static_cast<size_t>(max_string_length)) { // crop words longer than the max_string_length
                // Lower-case every character, including the first one, so
                // "The" and "the" become the same word.
                word[word_length] = static_cast<char>(tolower(c));
                word_length++;
            }
        } else if (word_started) {
            // First non-alphanumeric character after a word: the word is complete.
            return new My_string(word, word_length);
        }
    }
    // Ran out of text: return whatever was collected (possibly nothing).
    return new My_string(word, word_length);
}