Init (add codebase)

This commit is contained in:
Knyffen 2021-11-14 14:35:05 +01:00
parent d10424819c
commit bacd7cff71
43 changed files with 174555 additions and 2 deletions

48
Makefile Normal file
View File

@ -0,0 +1,48 @@
CC = cc # C compiler
# cc and gcc is the same
CXX = c++ # C++ compiler
# c++
# g++
# clang++
CPPFLAGS = # preprocessor flags
CFLAGS = -Wall -Wfloat-equal -std=c99 -O0 -mtune=native -funroll-loops $(OPT) # C compiler flags
# -Wall: Warnings all
# -Wfloat-equal: Warn if comparing floats with ==
# -OX: Optimization level -O0 (none, default), -O1, -O2, -O3 (most, may increase file size), -Os (decrease file size), -ffast-math
# -mtune: Tune for specific/generic CPU
# -funrool-loops: Unroll loops when compiling
# -fopenmp: Enable OpenMP (When running, set the environmental variable OMP_NUM_THREADS)
# $(OPT): Passes any parameter given when calling make with 'make OPT=...'
CXXFLAGS = -Wall -std=c++20 -O0 # C++ compiler flags
LDFLAGS = # linker flags
LDLIBS = # library flags
# -lm (for <math.h>)
# -lblas (BASIC LINEAR ALGEBRA SUBPROGRAMS)
# -lcblas (for <cblas.h>)
# -lopenblas (NOT INSTALLED; blas and cblas in one; conflicts with blas and cblas)
# -llapack (LAPACKE)
# -llapacke (LAPACK) NOTE: The order of "-llapacke -llapack -lblas" is very important
#LINK.o = $(CXX) $(LDFLAGS) # use CXX for linking
simple_string_matching: simple_string_matching.o Rabin_fingerprint.o general_library.o
$(CXX) $(CXXFLAGS) simple_string_matching.o Rabin_fingerprint.o general_library.o -o simple_string_matching
simple_string_matching.o: simple_string_matching.cpp
$(CXX) $(CXXFLAGS) -c simple_string_matching.cpp
Rabin_fingerprint.o: Rabin_fingerprint.cpp Rabin_fingerprint.hpp
$(CXX) $(CXXFLAGS) -c Rabin_fingerprint.cpp
general_library.o: general_library.cpp general_library.hpp
$(CXX) $(CXXFLAGS) -c general_library.cpp
porat-porat: porat-porat.cpp
# Tell the compiler that 'clean' isn't referring to a file
.PHONY: clean
# A make target that cleans (by deleting files)
clean:
$(RM) porat-porat
$(RM) simple_string_matching
$(RM) *.o

View File

@ -1,3 +1,33 @@
# Bachelors_Thesis_Code
# Bachelor's_Thesis_Code
Code used for my Bachelor's Thesis
# Versions
## Current Version
Compared to V1, this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder.
This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work.
## V1
#### `general_library.cpp`
- Exports functions for:
- Generating a random irreducible polynomial of degree up to 31.
- Contains code for:
- Contains code to multiply polynomials modulo another polynomial.
- Doesn't contain code to initially calculate modulo a polynomial. It requires the initial polynomials to all be at most of the same degree as the polynomial that you use for modulo.
- A very custom version of calculating Reduced Row Echelon Form assuming all values are in Z2.
#### `generate_initial_irreducible_polynomials.sage`
- Generates a list of irreducible polynomials, one of each degree. The polynomials are printed in a format that can be copy-pasted directly into `general_library.cpp`.
#### `generate_random_irreducible_polynomial.sage`
- Contains an general template of how `general_library.cpp` should implement generating a random irreducible polynomial. The output of this program can also be used for debugging, since it uses many (correctly implemented) built-in sagemath functions that have to manually implemented in `general_library.cpp`.
#### `multiply_polynomials_modulo_polynomial.sage`
- Contains a general template of how `general_library.cpp` should implement multiplying polynomials modulo a polynomial in Z2.
#### `compare_fingerprint_false_positive_probabilities.sage`
- Contains initial (i.e. very rouch) code, which compares the upper bounds of a false match occuring of the Rabin fingerprint and the fingerprint used in Karp-Rabin.
#### `test_rabin_fingerprint.sage`
- Contains initial (i.e. very rough) code which debunks my theory that modulo a prime is somewhat equivalent to modulo an irreducible polynomial.

73
Rabin_fingerprint.cpp Normal file
View File

@ -0,0 +1,73 @@
#include "Rabin_fingerprint.hpp"
Rabin_fingerprint::Rabin_fingerprint(uint32_t p, size_t window_size_in_bits) {
set_modulo_polynomial(p);
set_shift_polynomial(window_size_in_bits);
}
void Rabin_fingerprint::set_modulo_polynomial (uint32_t p) {
polynomial = p & ((uint32_t)pow(2, 31)-1);
}
void Rabin_fingerprint::set_shift_polynomial (size_t window_size_in_bits) {
#ifndef NDEBUG
if (polynomial == 0)
throw std::logic_error("Call set_modulo_polynomial first, as this function depends on the polynomial variable being set.");
#endif
shift_polynomial = 1;
// NOTE: We shift the bit 1 space too long, since we are removing the bit that has been pushed outside the window
for (size_t i = 0; i < window_size_in_bits; i++) {
shift_polynomial <<= 1;
if ((shift_polynomial & (uint32_t)pow(2, 31)) != 0)
shift_polynomial ^= polynomial;
}
}
void Rabin_fingerprint::push_char (char c) {
std::bitset<8> b(c);
for (char i = 7; i >= 0; i--) {
push_bit((bool)b[i]);
}
}
void Rabin_fingerprint::push_bit (bool b) {
fingerprint <<= 1;
fingerprint |= b;
if ((fingerprint & (uint32_t)pow(2, 31)) != 0)
fingerprint ^= polynomial;
}
void Rabin_fingerprint::shift_bit (bool b) {
#ifndef NDEBUG
if (shift_polynomial == 0)
throw std::logic_error("Call set_shift_polynomial first, as this function depends on the shift_polynomial variable being set.");
#endif
if (b)
fingerprint ^= shift_polynomial;
}
void Rabin_fingerprint::slide_char (char c_in, char c_out) {
std::bitset<8> b_in(c_in);
std::bitset<8> b_out(c_out);
for (char i = 7; i >= 0; i--) {
slide_bit((bool)b_in[i], (bool)b_out[i]);
}
}
/* #include <iostream> */
void Rabin_fingerprint::slide_bit (bool b_in, bool b_out) {
/* std::cout << "bitset b(c): " << b << std::endl; */
/* std::cout << "push bit " << b[i] << std::endl; */
/* std::cout << "push bit " << b_in << std::endl; */
push_bit(b_in);
/* std::cout << "shift bit " << b_out << std::endl; */
shift_bit(b_out);
}
uint32_t Rabin_fingerprint::get_fingerprint () {
return fingerprint;
}

30
Rabin_fingerprint.hpp Normal file
View File

@ -0,0 +1,30 @@
#ifndef RABIN_FINGERPRINT_H
#define RABIN_FINGERPRINT_H
#include <stdint.h>
#include <bitset>
#include <math.h>
#include <stdexcept>
class Rabin_fingerprint {
public:
Rabin_fingerprint(uint32_t polynomial, size_t window_size_in_bits);
void push_char (char c);
void push_bit (bool b);
void shift_bit (bool b);
void slide_char (char c_in, char c_out);
void slide_bit (bool b1, bool b2);
uint32_t get_fingerprint();
private:
void set_modulo_polynomial (uint32_t p);
void set_shift_polynomial (size_t window_size_in_bits);
uint32_t fingerprint = 0;
uint32_t polynomial = 0;
uint32_t shift_polynomial = 0;
};
#endif

46
V1/Makefile Normal file
View File

@ -0,0 +1,46 @@
CC = cc # C compiler
# cc and gcc is the same
CXX = c++ # C++ compiler
# c++
# g++
# clang++
CPPFLAGS = # preprocessor flags
CFLAGS = -Wall -Wfloat-equal -std=c99 -O0 -mtune=native -funroll-loops $(OPT) # C compiler flags
# -Wall: Warnings all
# -Wfloat-equal: Warn if comparing floats with ==
# -OX: Optimization level -O0 (none, default), -O1, -O2, -O3 (most, may increase file size), -Os (decrease file size), -ffast-math
# -mtune: Tune for specific/generic CPU
# -funrool-loops: Unroll loops when compiling
# -fopenmp: Enable OpenMP (When running, set the environmental variable OMP_NUM_THREADS)
# $(OPT): Passes any parameter given when calling make with 'make OPT=...'
CXXFLAGS = -Wall -std=c++20 -O0 # C++ compiler flags
LDFLAGS = # linker flags
LDLIBS = # library flags
# -lm (for <math.h>)
# -lblas (BASIC LINEAR ALGEBRA SUBPROGRAMS)
# -lcblas (for <cblas.h>)
# -lopenblas (NOT INSTALLED; blas and cblas in one; conflicts with blas and cblas)
# -llapack (LAPACKE)
# -llapacke (LAPACK) NOTE: The order of "-llapacke -llapack -lblas" is very important
#LINK.o = $(CXX) $(LDFLAGS) # use CXX for linking
simple_string_matching: simple_string_matching.o Rabin_fingerprint.o general_library.o
$(CXX) $(CXXFLAGS) simple_string_matching.o Rabin_fingerprint.o general_library.o -o simple_string_matching
simple_string_matching.o: simple_string_matching.cpp
$(CXX) $(CXXFLAGS) -c simple_string_matching.cpp
Rabin_fingerprint.o: Rabin_fingerprint.cpp Rabin_fingerprint.hpp
$(CXX) $(CXXFLAGS) -c Rabin_fingerprint.cpp
general_library.o: general_library.cpp general_library.hpp
$(CXX) $(CXXFLAGS) -c general_library.cpp
# Tell the compiler that 'clean' isn't referring to a file
.PHONY: clean
# A make target that cleans (by deleting files)
clean:
$(RM) simple_string_matching
$(RM) *.o

68
V1/Rabin_fingerprint.cpp Normal file
View File

@ -0,0 +1,68 @@
#include "Rabin_fingerprint.hpp"
Rabin_fingerprint::Rabin_fingerprint(uint32_t p, size_t window_size_in_bits) {
set_modulo_polynomial(p);
set_shift_polynomial(window_size_in_bits);
}
void Rabin_fingerprint::set_modulo_polynomial (uint32_t p) {
polynomial = p & ((uint32_t)pow(2, 31)-1);
}
void Rabin_fingerprint::set_shift_polynomial (size_t window_size_in_bits) {
#ifndef NDEBUG
if (polynomial == 0)
throw std::logic_error("Call set_modulo_polynomial first, as this function depends on the polynomial variable being set.");
#endif
shift_polynomial = 1;
// NOTE: We shift the bit 1 space too long, since we are removing the bit that has been pushed outside the window
for (size_t i = 0; i < window_size_in_bits; i++) {
shift_polynomial <<= 1;
if ((shift_polynomial & (uint32_t)pow(2, 31)) != 0)
shift_polynomial ^= polynomial;
}
}
void Rabin_fingerprint::push_char (char c) {
std::bitset<8> b(c);
for (char i = 7; i >= 0; i--) {
push_bit((bool)b[i]);
}
}
void Rabin_fingerprint::push_bit (bool b) {
fingerprint <<= 1;
fingerprint |= b;
if ((fingerprint & (uint32_t)pow(2, 31)) != 0)
fingerprint ^= polynomial;
}
void Rabin_fingerprint::shift_bit (bool b) {
#ifndef NDEBUG
if (shift_polynomial == 0)
throw std::logic_error("Call set_shift_polynomial first, as this function depends on the shift_polynomial variable being set.");
#endif
if (b)
fingerprint ^= shift_polynomial;
}
void Rabin_fingerprint::slide_char (char c_in, char c_out) {
std::bitset<8> b_in(c_in);
std::bitset<8> b_out(c_out);
for (char i = 7; i >= 0; i--) {
slide_bit((bool)b_in[i], (bool)b_out[i]);
}
}
void Rabin_fingerprint::slide_bit (bool b_in, bool b_out) {
push_bit(b_in);
shift_bit(b_out);
}
uint32_t Rabin_fingerprint::get_fingerprint () {
return fingerprint;
}

30
V1/Rabin_fingerprint.hpp Normal file
View File

@ -0,0 +1,30 @@
#ifndef RABIN_FINGERPRINT_H
#define RABIN_FINGERPRINT_H
#include <stdint.h>
#include <bitset>
#include <math.h>
#include <stdexcept>
class Rabin_fingerprint {
public:
Rabin_fingerprint(uint32_t polynomial, size_t window_size_in_bits);
void push_char (char c);
void push_bit (bool b);
void shift_bit (bool b);
void slide_char (char c_in, char c_out);
void slide_bit (bool b1, bool b2);
uint32_t get_fingerprint();
private:
void set_modulo_polynomial (uint32_t p);
void set_shift_polynomial (size_t window_size_in_bits);
uint32_t fingerprint = 0;
uint32_t polynomial = 0;
uint32_t shift_polynomial = 0;
};
#endif

View File

@ -0,0 +1,73 @@
# Rabin fingerprint: k > lg(n*m/e), e is the upper bound on the error probability
# Karp-Rabin fingerprint: <= pi(n(n-m+1))/pi(M)
# 32-bit
M = 2**32-1
k = 31
# 16-bit
# M = 2**16-1
# k = 13
def fpi(a):
p = Primes()
a_next = p.next(a)
i = 0
jump = 1
lastdirection = -2
while(True):
print(f'{jump:010d}', end='\r')
# print(f'{i}, {jump}')
val = p.unrank(i)
if val == a_next:
return i # i-1, is the value we search for, but we count from 0, so i is correct
elif val > a_next:
if lastdirection == 2:
# print('mul')
lastdirection -= 1
jump *= 2
else:
# print('div')
jump //= 2
lastdirection += 1
# print(jump)
i -= jump
else:
if lastdirection == -2:
# print('mul')
lastdirection += 1
jump *= 2
else:
# print('div')
jump //= 2
lastdirection -= 1
i += jump
# Rabin fingerprint
# k = lg(n*m/e) <=>
# 2^k = n*m/e <=>
# 2^k/(n*m) = 1/e <=>
# n*m/(2^k) = e
# Karp-Rabin fingerprint
# e = pi(n(n-m+1))/pi(M)
# print(pi(2**32-1))
# e_rabin = n*m/(2^k)
piM = fpi(M)
for m in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
for n in [10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
if m>=n:
continue
print(f'(n, m) = ({n}, {m})')
e_rabin = n*m/(2**k)
print(f'Rabin: {float(e_rabin):.2e}')
e_karpr = fpi(n*(n-m+1))/piM
print(f'KarpR: {float(e_karpr):.2e}')
print()
print(f'{e_karpr} >= 1 is {e_karpr >= 1} | {e_rabin} >= 1 is {e_rabin >= 1}')
if float(e_karpr) >= 1 or float(e_rabin) >= 1:
break

184
V1/general_library.cpp Normal file
View File

@ -0,0 +1,184 @@
#include "general_library.hpp"
// SECTION: Functions related to calculating with polynomials in Z2[x]
uint32_t irreducible_polynomials[] {
0b11, // irreducible polynomial of degree 1
0b111, // irreducible polynomial of degree 2
0b1011, // irreducible polynomial of degree 3
0b10011, // irreducible polynomial of degree 4
0b100101, // irreducible polynomial of degree 5
0b1011011, // irreducible polynomial of degree 6
0b10000011, // irreducible polynomial of degree 7
0b100011101, // irreducible polynomial of degree 8
0b1000010001, // irreducible polynomial of degree 9
0b10001101111, // irreducible polynomial of degree 10
0b100000000101, // irreducible polynomial of degree 11
0b1000011101011, // irreducible polynomial of degree 12
0b10000000011011, // irreducible polynomial of degree 13
0b100000010101001, // irreducible polynomial of degree 14
0b1000000000110101, // irreducible polynomial of degree 15
0b10000000000101101, // irreducible polynomial of degree 16
0b100000000000001001, // irreducible polynomial of degree 17
0b1000001010000000011, // irreducible polynomial of degree 18
0b10000000000000100111, // irreducible polynomial of degree 19
0b100000000011011110011, // irreducible polynomial of degree 20
0b1000000000000001100101, // irreducible polynomial of degree 21
0b10000000001111101100001, // irreducible polynomial of degree 22
0b100000000000000000100001, // irreducible polynomial of degree 23
0b1000000011110011010101001, // irreducible polynomial of degree 24
0b10000000000000000101000101, // irreducible polynomial of degree 25
0b100000000000100010111010011, // irreducible polynomial of degree 26
0b1000000000000001011010101101, // irreducible polynomial of degree 27
0b10000000000000010000011100101, // irreducible polynomial of degree 28
0b100000000000000000000000000101, // irreducible polynomial of degree 29
0b1000000000000110010100010101111, // irreducible polynomial of degree 30
0b10000000000000000000000000001001 // irreducible polynomial of degree 31
};
size_t degree (uint32_t polynomial) {
return 31 - __builtin_clz(polynomial);
}
uint32_t multiply_modulo_polynomials_in_Z2 (uint32_t q1, uint32_t q2, uint32_t p) {
size_t p_degree = degree(p);
size_t q1_degree = degree(q1);
size_t q2_degree = degree(q2);
if (q1_degree > p_degree)
throw std::logic_error("We only support multiply-modulo whenever the multiplied polynomials are initially at most the same degree as the modulo polynomial.");
if (q1_degree == p_degree)
q1 ^= p;
uint32_t arr[q2_degree+1];
arr[0] = q1;
for (size_t i = 1; i <= q2_degree; i++) {
arr[i] = arr[i-1]<<1;
if (degree(arr[i]) == p_degree)
arr[i] ^= p;
}
uint32_t sum = 0;
std::bitset<32> b(q2);
for (size_t i = 0; i <= q2_degree; i++) {
if (b[i] == 1)
sum ^= arr[i];
}
return sum;
}
uint32_t polynomial_power_in_Z2(uint32_t q, uint32_t exp, uint32_t p) {
if (exp == 0)
return 1;
if (degree(q) == degree(p))
q ^= p;
size_t log2exp = degree(exp) + 1;
uint32_t arr[log2exp];
arr[0] = q;
for (size_t i = 1; i < log2exp; i++)
arr[i] = multiply_modulo_polynomials_in_Z2(arr[i-1], arr[i-1], p);
uint32_t res = 1;
std::bitset<32> b(exp);
for (size_t i = 0; i <= log2exp; i++) {
if (b[i] == 1)
res = multiply_modulo_polynomials_in_Z2(res, arr[i], p);
}
return res;
}
void rref_Z2(std::vector<std::bitset<32>> &A, size_t m, size_t n) {
if (m > 31 || n > 32)
throw std::logic_error("We only support rref_Z2 for up to a (31x32)-matrix.");
// calculate row echelon form
// https://en.wikipedia.org/wiki/Gaussian_elimination#Pseudocode
size_t h = 0; // Initialization of the pivot row
size_t k = 0; // Initialization of the pivot column
while (h < m && k < n) {
// Find the k-th pivot:
int i_max = -1;
for (size_t i = h; i < m; i++) {
if (A[i][k]) {
i_max = i;
break;
}
}
if (i_max == -1) {
// No pivot in this column, pass to next column
k++;
} else {
auto tmp = A[i_max];
A[i_max] = A[h];
A[h] = tmp;
// Do for all rows below pivot:
for (size_t i = h+1; i < m; i++){
bool f = A[i][k];
if (f == 0)
continue;
// Fill with zeros the lower part of pivot column:
A[i][k] = 0;
// Do for all remaining elements in current row:
for (size_t j = k+1; j<n; j++) {
A[i][j] = A[i][j] ^ A[h][j];
}
}
// Increase pivot row and column
h++;
k++;
}
}
// Perform back substitution
for (size_t i = 1; i < m; i++) {
for (size_t j = 0; j < i; j++) {
if (A[j][i] == 1) {
for (size_t l = i; l < n; l++) // check this
A[j][l] = A[j][l] ^ A[i][l];
}
}
}
}
uint32_t get_random_irreducible_polynomial_in_Z2 (size_t k) {
if (k > 31)
throw std::logic_error("We only support polynomials of degree 31 or less.");
uint32_t init_p = irreducible_polynomials[k-1];
srand(time(0));
uint32_t gamma = (rand() % (((uint32_t)1 << (k+1)) - 2)) + 2; // gamma in GF(2^k)\GF(2)
uint32_t gamma_pow[k+1];
for (size_t i = 0; i <= k; i++)
gamma_pow[i] = polynomial_power_in_Z2(gamma, i, init_p);
// Create an array of the polynomials in gamma_pow, and transpose it at the same time
std::vector<std::bitset<32>> A(k); // NOTE: That 32 > 31 >= k
for (size_t i = 0; i < k; i++) {
for (size_t j = 0; j <= k; j++) {
A[i][k-j] = gamma_pow[j] & ((uint32_t)1<<i);
}
}
rref_Z2(A, k, k+1);
std::bitset<32> bp;
for (size_t i = 0; i < k; i++) {
bp[i] = A[i][k];
}
bp[k] = 1;
uint32_t p = (uint32_t)bp.to_ulong();
return p;
}

14
V1/general_library.hpp Normal file
View File

@ -0,0 +1,14 @@
#ifndef GENERAL_LIBRARY_H
#define GENERAL_LIBRARY_H
#include <stdlib.h>
#include <stdint.h>
#include <stdexcept>
#include <vector>
#include <bitset>
#include <iostream>
// SECTION: Functions related to calculating with polynomials in Z2[x]
uint32_t get_random_irreducible_polynomial_in_Z2 (size_t k);
#endif

View File

@ -0,0 +1,12 @@
F = GF(2)
PR = PolynomialRing(F, 'x')
max_k = 31
whitespace = max_k+1
for i in range(1, max_k+1):
poly_int = '0b'+ ''.join(map(str, PR.irreducible_element(i).coefficients(sparse=False)[::-1]))
poly_int += ','
poly_int += ' '*(max_k-i)
print(f' {poly_int}// irreducible polynomial of degree {i}')

View File

@ -0,0 +1,96 @@
k = 31
F = GF(2)
E = GF(2^k)
PR = PolynomialRing(F, 'x')
p = PR.irreducible_element(k)
while(True):
gamma = E.random_element()
if gamma not in F:
break
# gamma = PR('x^3+x')
# print('0b'+ ''.join(map(str, gamma.coefficients(sparse=False)[::-1])))
eqs = [PR(gamma)^i % p for i in range(k+1)]
for eq in eqs:
print(eq)
print()
eqs = list(map(lambda eq: eq.coefficients(sparse=False) + [0]*(k-eq.degree()-1), eqs))
for eq in eqs:
print(eq)
print()
eqs = matrix(eqs[::-1])
print(eqs)
print()
eqs = eqs.transpose()
print(eqs)
print()
rref = eqs.rref()
print(rref)
irrpol = PR(rref[:,-1].list() + [1])
print(irrpol)
print(irrpol.is_irreducible())
print()
# manual rref
def calc_rref_Z2(A, m, n):
"""Calculate the reduced row echelon form of a (mxn)-matrix"""
A = copy(A)
# calculate row echelon form
# https://en.wikipedia.org/wiki/Gaussian_elimination#Pseudocode
h = 0 # Initialization of the pivot row
k = 0 # Initialization of the pivot column
while h < m and k < n:
# Find the k-th pivot:
i_max = -1
for i in range(h, m):
if A[i,k] == 1:
i_max = i
break
if i_max == -1:
# No pivot in this column, pass to next column
k += 1
else:
tmp = A[i_max, :]
A[i_max, :] = A[h, :]
A[h, :] = tmp
# Do for all rows below pivot:
for i in range(h+1, m):
f = A[i][k];
if f == 0:
continue;
A[i, k] = 0;
for j in range(k+1, n):
A[i, j] = A[i, j] - A[h, j]
# Increase pivot row and column
h += 1
k += 1
# perform back substitution
for i in range(1, m):
for j in range(i):
if A[j, i] == 1:
A[j, :] -= A[i, :]
return A
print('Manual RREF:\n')
rref = calc_rref_Z2(eqs, k, k+1) # cols = rows+1, since we transposed the matrix
print(rref)
print()
irrpol = PR(rref[:,-1].list() + [1])
print(irrpol)
print(irrpol.is_irreducible())
print()
print('0b'+ ''.join(map(str, irrpol.coefficients(sparse=False)[::-1])))

View File

@ -0,0 +1,68 @@
import math
k = 31
F = GF(2)
var = 'x'
PR = PolynomialRing(F, var)
p = PR.irreducible_element(k)
gamma = PR.random_element(k)
exp = 10
target = gamma^exp % p
def polynomial_power_in_Z2(q, exp, p, var):
if exp == 0:
return 1
if q.degree() == p.degree():
q += p
res = q
for i in range(2, exp+1):
res = multiply_polynomials_in_Z2(res, q, p, var)
return res
def polynomial_power_in_Z2_V2(q, exp, p, var):
if exp == 0:
return 1
if q.degree() == p.degree():
q += p
log2deg = math.floor(math.log2(exp)) + 1
arr = [0]*log2deg
arr[0] = q
for i in range(1, log2deg):
arr[i] = multiply_polynomials_in_Z2(arr[i-1], arr[i-1], p, var)
res = PR(1)
for i, b in enumerate(bin(exp)[:1:-1]):
if b == '1':
res = multiply_polynomials_in_Z2(res, arr[i], p, var)
return res
def multiply_polynomials_in_Z2(q1, q2, p, var):
if q1.degree() > p.degree():
raise ValueError('Unsupported!')
if q1.degree() == p.degree():
q1 += p
arr = [0]*(q2.degree()+1)
arr[0] = q1
for i in range(1, q2.degree()+1):
arr[i] = arr[i-1]*PR(var)
if arr[i].degree() == p.degree():
arr[i] += p
return sum(arr[i] for i in q2.exponents())
print(target)
res = polynomial_power_in_Z2(gamma, exp, p, var)
print(res)
res = polynomial_power_in_Z2_V2(gamma, exp, p, var)
print(res)

BIN
V1/simple_string_matching Executable file

Binary file not shown.

View File

@ -0,0 +1,65 @@
/* #define NDEBUG */
#include "Rabin_fingerprint.hpp"
#include "general_library.hpp"
#include <iostream>
#include <stdint.h>
#include <math.h>
#include <string>
#include <fstream>
void print_match (size_t index, size_t length, std::string &T) {
std::cout << "Match found at index " << index << " with the text \"";
for (size_t i = 0; i < length; i++)
std::cout << T[index + i];
std::cout << "\"" << std::endl;
}
int main() {
/* std::ifstream ifs("../books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream ifs("../books/genji_monogatari_english.txt");
std::string T( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
/* std::string T = "Hello, this is my test string averylongword is a necessary word to exceed the 32 bit window."; */
// Test without the modulo polynomial - and two matches
std::string P = "word";
// Test with the modulo polynomial
/* std::string P = "averylongword"; */
std::cout << "Searching for pattern:" << std::endl;
std::cout << " " << P << std::endl;
/* std::cout << "in text:" << std::endl; */
/* std::cout << " " << T << std::endl; */
std::cout << std::endl;
/* uint32_t polynomial = pow(2, 30) + pow(2, 2) + 1; // x^31 + x^3 + 1 */
uint32_t polynomial = get_random_irreducible_polynomial_in_Z2(31);
/* uint32_t polynomial = 0b11010011100100000111101011110111; */
// Test without the modulo polynomial
size_t window_size_in_bits = P.length()*8;
// Hash the pattern
Rabin_fingerprint fP(polynomial, window_size_in_bits);
for (char c : P)
fP.push_char(c);
// Hash the text
Rabin_fingerprint fT(polynomial, window_size_in_bits);
for (size_t i = 0; i < P.length(); i++)
fT.push_char(T[i]);
if (fT.get_fingerprint() == fP.get_fingerprint())
print_match(0, P.length(), T);
for (size_t i = P.length(); i < T.length(); i++) {
fT.slide_char(T[i], T[i-P.length()]);
if (fT.get_fingerprint() == fP.get_fingerprint())
print_match(i-P.length()+1, P.length(), T);
}
std::cout << std::endl;
std::cout << "Done!" << std::endl;
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,97 @@
from random import randint
F = GF(2)
PR = PolynomialRing(F, 'x')
k = 31
P = PR.irreducible_element(k)
m = 100
# Test the calculation for moving the left edge of the sliding window
for _ in range(1):
q = tuple(randint(0, 1) for _ in range(m))
for i in range(1, m+1):
for j in range(i+1, m+1):
assert (PR(q[:i]) % P) == (PR(q) % P) - (PR(f'x^{i}')*PR(q[i:]) % P)
# Moving the right side of the window is as described in the Rabin Fingerprint article
# Next step (TODO)
# pattern = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,1)
# period = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0)
# m = len(pattern)
# s_period = tuple(list(period)+[0]*(m-len(period)))
# pattern_1 = (1)
# period_1 = (1)
# s_period_1 = tuple(list(period_1)+[0]*(m-len(period_1)))
# pattern_2 = (1,0)
# period_2 = (1,0)
# s_period_2 = tuple(list(period_2)+[0]*(m-len(period_2)))
# pattern_4 = (1,0,1,1)
# period_4 = (1,0,1)
# s_period_4 = tuple(list(period_4)+[0]*(m-len(period_4)))
# pattern_8 = (1,0,1,1,1,1,0,1)
# period_8 = (1,0,1,1,1)
# s_period_8 = tuple(list(period_8)+[0]*(m-len(period_8)))
# pattern_16 = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1)
# period_16 = (1,0,1,1,1,1,0,1,1,0,1,0)
# s_period_16 = tuple(list(period_16)+[0]*(m-len(period_16)))
# for i in range(len(pattern)-len(pattern_4)-len(period_4)):
# assert PR(pattern[i+len(period_4):i+len(pattern_4)+len(period_4)]) == PR(pattern[i:i+len(pattern_4)] + pattern[i+len(pattern_4):i+len(pattern_4)+len(period_4)]) - PR(s_period_4)
# assert PR(pattern[i+len(period_4):i+len(pattern_4)+len(period_4)]) == pattern[i-len(period_4):i+len(pattern_4)-len(period_4)]+pattern[i+len(pattern_4)-len(period_4):i+len(pattern_4)-len(period_4)] - PR(s_period_4)
# Does the bits of a prime number correspond to the coefficients of an irreducible polynomial?
k = 31
reducible = 0
irreducible = 0
for p in Primes()[200000000:200010000]:#[100000:100000]:
# for p in range(2147483648,2147483648+10000):
poly = PR(tuple(bin(p)[2:][::-1]))
# print(f'{poly.degree()}')
if poly.degree() > k:
break
if poly.degree() != k:
continue
irr = poly.is_irreducible()
# print(f'The polynomial {poly}, corresponding to the prime {p}, being an irreducible polynomial is {irr}.')
if irr:
irreducible += 1
else:
reducible += 1
print(f'{irreducible}/{reducible+irreducible} of the primes correspond to irreducible polynomials. That is {float(irreducible/(reducible+irreducible))*100}%')
progress = 0
the_same = 0
different = 0
for p in Primes()[200000000:200010000]:
progress += 1
print(f'\r{progress}/10000', end='')
p_poly = PR(tuple(bin(p)[2:][::-1]))
if not p_poly.is_irreducible():
continue
# i = p + randint(1,2**29)
i = p + randint(1,p-1)
# print(f'{i}, {p}, {2*p}, {i%p}')
i_poly = PR(tuple(bin(i)[2:][::-1]))
i_mod_p = i % p
i_mod_p_poly = PR(tuple(bin(i_mod_p)[2:][::-1]))
# print(f'i_poly: {i_poly}')
# print(f'p_poly: {p_poly}')
# print(f'i_mod_p_poly: {i_mod_p_poly}')
if i_mod_p_poly == i_poly % p_poly:
the_same += 1
else:
different += 1
# if not i_mod_p_poly == i_poly - (p_poly - PR(f'x^{p_poly.degree()}')):
# if not i_mod_p_poly == i_poly % p_poly:
# print()
# print('ERROR')
# print(i_mod_p_poly)
# # print(i_poly - (p_poly - PR(f'x^{p_poly.degree()}')))
# print(i_poly % p_poly)
# break
# assert i_mod_p_poly == i_poly - (p_poly - PR(f'x^{p_poly.degree()}'))
print()
print(f'{the_same}/{the_same+different} are the same as if we did polynomial calculations. This is {float(the_same/(the_same+different))*100}%')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
# Rabin fingerprint: k > lg(n*m/e), e is the upper bound on the error probability
# Karp-Rabin fingerprint: <= pi(n(n-m+1))/pi(M)
# 32-bit
M = 2**32-1
k = 31
# 16-bit
# M = 2**16-1
# k = 13
def fpi(a):
p = Primes()
a_next = p.next(a)
i = 0
jump = 1
lastdirection = -2
while(True):
print(f'{jump:010d}', end='\r')
# print(f'{i}, {jump}')
val = p.unrank(i)
if val == a_next:
return i # i-1, is the value we search for, but we count from 0, so i is correct
elif val > a_next:
if lastdirection == 2:
# print('mul')
lastdirection -= 1
jump *= 2
else:
# print('div')
jump //= 2
lastdirection += 1
# print(jump)
i -= jump
else:
if lastdirection == -2:
# print('mul')
lastdirection += 1
jump *= 2
else:
# print('div')
jump //= 2
lastdirection -= 1
i += jump
# Rabin fingerprint
# k = lg(n*m/e) <=>
# 2^k = n*m/e <=>
# 2^k/(n*m) = 1/e <=>
# n*m/(2^k) = e
# Karp-Rabin fingerprint
# e = pi(n(n-m+1))/pi(M)
# print(pi(2**32-1))
# e_rabin = n*m/(2^k)
piM = fpi(M)
for m in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
for n in [10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
if m>=n:
continue
print(f'(n, m) = ({n}, {m})')
e_rabin = n*m/(2**k)
print(f'Rabin: {float(e_rabin):.2e}')
e_karpr = fpi(n*(n-m+1))/piM
print(f'KarpR: {float(e_karpr):.2e}')
print()
print(f'{e_karpr} >= 1 is {e_karpr >= 1} | {e_rabin} >= 1 is {e_rabin >= 1}')
if float(e_karpr) >= 1 or float(e_rabin) >= 1:
break

175
ex53.cpp Normal file
View File

@ -0,0 +1,175 @@
/* #define NDEBUG */
#include "hash_table.hpp"
#include "hashing_algorithms.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <chrono>
#include <iostream>
#include <bitset>
#include <fstream>
#include <ctype.h>
#include <vector>
#include <assert.h>
using namespace std;
const int max_string_length = 256;
const int D = (max_string_length+7) >> 3; // D = ceil(max_string_length/size(char))
My_string * read_word(const char * book, size_t * reading_progress);
int main() {
const clock_t clock_before_seed = clock();
// load seed generated by random.org
string filename = "random_org_16_bit_numbers.txt";
std::ifstream seed_file(filename);
uint16_t seed_part;
uint16_t seed_parts[D << 2];
size_t count = 0;
while (seed_file >> seed_part) {
seed_parts[count] = seed_part;
count++;
if (count == D << 2)
break; // the seed is large enough for the max_string_length
}
if (count != D << 2) {
cout << "The current seed is not large enough. Please extend it by appending the numbers at https://www.random.org/integers/?num=10000&min=0&max=65535&col=1&base=10&format=plain&rnd=new." << endl;
return EXIT_SUCCESS;
}
// Get 64 bit seed from 16 bit seed
uint64_t seed[D];
memcpy(seed, seed_parts, D*sizeof(seed[0]));
const clock_t clock_before_hash_table = clock();
size_t l = 1; // initially, hash to a table of up to 1024 distinct words (2^10)
Hash_table ht = Hash_table(new Hash_function(seed, multiply_shift_string, l, D));
const clock_t clock_before_loading_book = clock();
// choose a book
/* std::ifstream ifs("genji_monogatari_english.txt"); */
/* std::ifstream ifs("Child_of_Light.txt"); */
/* std::ifstream ifs("the_adventures_of_sherlock_holmes.txt"); */
/* std::ifstream ifs("dracula.txt"); */
std::ifstream ifs("the_complete_works_of_william_shakespeare.txt");
string book_string( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
const char * book = book_string.c_str();
const size_t book_length = book_string.size();
const clock_t clock_before_reading = clock();
std::vector<My_string *> words;
size_t reading_progress = 0;
while (reading_progress < book_length-1) { // book_length includes '\0' which reading_progress avoids
My_string * word = read_word(book, &reading_progress);
if (word->size > 0)
words.push_back(word);
}
const clock_t clock_after_reading = clock();
size_t count_words = 0;
for (My_string * word : words) {
count_words++;
ht.hash(word);
if (ht.is_time_for_rehash()) {
l++; // double the universe which we hashes to
ht.rehash(new Hash_function(seed, multiply_shift_string, l, D));
}
}
const clock_t clock_after_hashing = clock();
Hash_function hf = Hash_function(seed, multiply_shift_string, 10, D);
const clock_t clock_after_init_hash_function = clock();
uint64_t hashed_value = 0;
for (My_string * word : words) {
/* hf.hash(word); */
hashed_value += hf.hash(word);
}
const clock_t clock_after_hashing_only = clock();
cout << "Sum of the hashed values (after overflow): " << hashed_value << endl;
cout << "Nr of words: " << count_words << endl;
cout << "Distinct words: " << ht.get_distict_words() << endl;
cout << "Time: Load seed: " << float( clock_before_hash_table - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash table: " << float( clock_before_loading_book - clock_before_hash_table ) / CLOCKS_PER_SEC << endl;
cout << "Time: Load book: " << float( clock_before_reading - clock_before_loading_book ) / CLOCKS_PER_SEC << endl;
cout << "Time: Read: " << float( clock_after_reading - clock_before_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash & Table: " << float( clock_after_hashing - clock_after_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Total: " << float( clock_after_hashing - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash function: " << float( clock_after_init_hash_function - clock_after_hashing ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash, no table: " << float( clock_after_hashing_only - clock_after_init_hash_function ) / CLOCKS_PER_SEC << endl;
for (My_string * word : words)
delete word;
// BONUS: Polynomium hashing
uint32_t result_un;
size_t book_as_integer_length = (book_length+3)>>2; // ceil(book_length/4)
uint32_t book_as_integer[book_as_integer_length];
book_as_integer[book_as_integer_length-1] = 0;
memcpy(book_as_integer, book, book_length*sizeof(book[0]));
uint32_t a[3] = {12313,2212312,123332};
uint32_t b[3] = {5345,213213,123123};
uint32_t c[3] = {3231,213144,450022};
const clock_t begin_time_pol = clock();
result_un = polynomial_vector(book_as_integer, a, b, c, book_as_integer_length >> 2, 32);
const clock_t end_time_pol = clock();
cout << "polynomium hash function time: " << float( end_time_pol - begin_time_pol ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
const clock_t begin_time_pol_tun = clock();
result_un = polynomial_vector_tuned(book_as_integer, a, b, c, book_as_integer_length >> 2, 32, seed);
const clock_t end_time_pol_tun = clock();
cout << "tuned polynomium hash function time: " << float( end_time_pol_tun - begin_time_pol_tun ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
return EXIT_SUCCESS;
}
My_string * read_word(const char * book, size_t * reading_progress) {
bool word_started = false;
char word[max_string_length];
size_t word_length = 0;
char c;
while (book[*reading_progress+1] != '\0') {
(*reading_progress)++;
c = book[*reading_progress];
if (word_started) {
if (isalnum(c)) {
if (word_length != max_string_length) { // crop words longer than the max_string_length
word[word_length] = tolower(c);
word_length++;
}
} else {
return new My_string(word, word_length);
}
} else if (isalnum(c)) {
word_started = true;
word[word_length] = c;
word_length++;
}
}
return new My_string(word, word_length);
}

48
fagprojekt_code/Makefile Normal file
View File

@ -0,0 +1,48 @@
CC = cc # C compiler
# cc and gcc is the same
CXX = c++ # C++ compiler
# c++
# g++
# clang++
CPPFLAGS = # preprocessor flags
CFLAGS = -Wall -Wfloat-equal -std=c99 -O0 -mtune=native -funroll-loops $(OPT) # C compiler flags
# -Wall: Warnings all
# -Wfloat-equal: Warn if comparing floats with ==
# -OX: Optimization level -O0 (none, default), -O1, -O2, -O3 (most, may increase file size), -Os (decrease file size), -ffast-math
# -mtune: Tune for specific/generic CPU
# -funrool-loops: Unroll loops when compiling
# -fopenmp: Enable OpenMP (When running, set the environmental variable OMP_NUM_THREADS)
# $(OPT): Passes any parameter given when calling make with 'make OPT=...'
CXXFLAGS = -Wall -std=c++17 -O0 # C++ compiler flags
LDFLAGS = # linker flags
LDLIBS = # library flags
# -lm (for <math.h>)
# -lblas (BASIC LINEAR ALGEBRA SUBPROGRAMS)
# -lcblas (for <cblas.h>)
# -lopenblas (NOT INSTALLED; blas and cblas in one; conflicts with blas and cblas)
# -llapack (LAPACKE)
# -llapacke (LAPACK) NOTE: The order of "-llapacke -llapack -lblas" is very important
#LINK.o = $(CXX) $(LDFLAGS) # use CXX for linking
# What to do when making ex1.c
#ex1: datasize1.o
# What to do when making ex1.cpp (C++ style)
#ex1: datasize1.cpp
ex53: hashed_string.cpp hash_table.cpp hashing_algorithms.cpp
eq17: hashed_string.cpp hash_table.cpp hashing_algorithms.cpp
# Tell the compiler that 'clean' isn't referring to a file
#.PHONY: clean
# A make target that cleans (by deleting files)
clean:
$(RM) fast_modulo *.o
$(RM) eq17 *.o
$(RM) ex53 *.o
$(RM) hashed_string *.o
$(RM) hash_table *.o
fast_modulo:

View File

@ -0,0 +1,57 @@
#include <stdlib.h>
/* #include <chrono> */
#include <iostream>
/* #include <stdio.h> */
/* #include <cmath> */
/* #include <string.h> */
/* #include <stdint.h> */
#include <bitset>
#include <fstream>
using namespace std;
int main() {
string s = "test";
const char * c = s.c_str();
for (size_t i = 0; i < s.size(); i++) {
cout << std::bitset<__CHAR_BIT__>( c[i] ) << " ";
}
cout << endl;
std::ifstream ifs("genji_monogatari_english.txt");
string genji( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
const char * c2 = genji.c_str();
for (size_t i = 0; i < 1000; i++) {
/* cout << std::bitset<__CHAR_BIT__>( c2[i] ) << " "; */
cout << c2[i];
}
cout << endl;
/* cout << genji.size() << endl; */
/* unsigned char bitgenji[genji.size()]; */
/* for (size_t i = 0; i < genji.size(); ++i) */
/* { */
/* bitgenji[i] = static_cast<unsigned char>( bitset<8>(genji[i]).to_ulong() ); */
/* } */
/* for (size_t i = 0; i < 4; ++i) */
/* { */
/* cout << (bitset<8>) bitgenji[i] << endl; */
/* cout << (unsigned int) bitgenji[i] << endl; */
/* } */
/* for (size_t i = 0; i < 100; ++i) */
/* { */
/* cout << bitgenji[i]; */
/* } */
/* cout << endl; */
return EXIT_SUCCESS;
}

138
fagprojekt_code/eq17.cpp Normal file
View File

@ -0,0 +1,138 @@
#include "hashing_algorithms.hpp"
#include <stdlib.h>
#include <chrono>
#include <iostream>
/* #include <stdio.h> */
#include <cmath>
/* #include <string.h> */
/* #include <stdint.h> */
using namespace std;
int main() {
unsigned int M = 9299983;
unsigned int m = 3226097;
unsigned int result;
const clock_t begin_time1 = clock();
for (unsigned int y = 0; y < M; y++) {
result = (unsigned int) ((float)(y*m)/(float)M);
}
const clock_t end_time1 = clock();
cout << "Naive time: " << float( end_time1 - begin_time1 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
const clock_t begin_time2 = clock();
for (unsigned int y = 0; y < M; y += 10) {
result = (unsigned int) ((float)(y*m)/(float)M);
result = (unsigned int) ((float)((y+1)*m)/(float)M);
result = (unsigned int) ((float)((y+2)*m)/(float)M);
result = (unsigned int) ((float)((y+3)*m)/(float)M);
result = (unsigned int) ((float)((y+4)*m)/(float)M);
result = (unsigned int) ((float)((y+5)*m)/(float)M);
result = (unsigned int) ((float)((y+6)*m)/(float)M);
result = (unsigned int) ((float)((y+7)*m)/(float)M);
result = (unsigned int) ((float)((y+8)*m)/(float)M);
result = (unsigned int) ((float)((y+9)*m)/(float)M);
}
const clock_t end_time2 = clock();
cout << "Naive time (unrolled): " << float( end_time2 - begin_time2 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
const clock_t begin_time6 = clock();
for (unsigned int y = 0; y < M; y++) {
result = y*m/M;
}
const clock_t end_time6 = clock();
cout << "Naive time (no-typecasting): " << float( end_time6 - begin_time6 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
const clock_t begin_time7 = clock();
for (unsigned int y = 0; y < M; y += 10) {
result = y*m/M;
result = ((y+1)*m)/M;
result = ((y+2)*m)/M;
result = ((y+3)*m)/M;
result = ((y+4)*m)/M;
result = ((y+5)*m)/M;
result = ((y+6)*m)/M;
result = ((y+7)*m)/M;
result = ((y+8)*m)/M;
result = ((y+9)*m)/M;
}
const clock_t end_time7 = clock();
cout << "Naive time (no-typecasting, unrolled): " << float( end_time7 - begin_time7 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
float mM = (float) m/(float) M;
const clock_t begin_time3 = clock();
for (unsigned int y = 0; y < M; y ++) {
result = (unsigned int) (((float) y)*mM);
}
const clock_t end_time3 = clock();
cout << "Precalculation time: " << float( end_time3 - begin_time3 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
const clock_t begin_time4 = clock();
for (unsigned int y = 0; y < M; y += 10) {
result = (unsigned int) ((float) y*mM);
result = (unsigned int) ((float) (y+1)*mM);
result = (unsigned int) ((float) (y+2)*mM);
result = (unsigned int) ((float) (y+3)*mM);
result = (unsigned int) ((float) (y+4)*mM);
result = (unsigned int) ((float) (y+5)*mM);
result = (unsigned int) ((float) (y+6)*mM);
result = (unsigned int) ((float) (y+7)*mM);
result = (unsigned int) ((float) (y+8)*mM);
result = (unsigned int) ((float) (y+9)*mM);
}
const clock_t end_time4 = clock();
cout << "Precalculation time (unrolled): " << float( end_time4 - begin_time4 ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result << endl;
/* for (unsigned int y = 0; y < M; y++){ */
/* // Very often, differentiating by multiplying first or dividing first results in the result being off by 1 from each other */
/* /1* if (abs((int) ((float)y*((float)m/(float)M)) - (int) ((float) y*mM)) >= 1) *1/ */
/* /1* cout <<(unsigned int) ((float)y*(float)m/(float)M) << " != " << (unsigned int) ((float) y*mM) << endl; *1/ */
/* /1* if (abs((int)(y*m/M - (y*m)/M)) >= 1) *1/ */
/* /1* cout <<y*m/M << " != " << (y*m)/M << endl; *1/ */
/* /1* if (abs((int)(((float)y*((float)m/(float)M)) - (y*m)/M)) >= 1) *1/ */
/* /1* cout <<(unsigned int)((float)y*((float)m/(float)M)) << " != " << (y*m)/M << endl; *1/ */
/* if (abs((int)(((float)y*(float)m)/(float)M - (unsigned int)y*(unsigned int)m/(unsigned int)M)) >= 1) */
/* cout <<((float)(y*m)/(float)M << " != " << (unsigned int)y*(unsigned int)m/(unsigned int)M << endl; */
/* } */
// The non-precalculation model is susceptible to overflow
/* for (unsigned int y = 0; y < 40; y++) { */
/* cout << "y: " << y << ", "; */
/* cout << "m: " << m << ", "; */
/* cout << "M: " << M << ", "; */
/* cout << "mM: " << mM << ", "; */
/* cout << "y*m: " << y*m << ", "; */
/* cout << "(y*m)/M: " << (y*m)/M << ", "; */
/* cout << "y*(m/M): " << y*(m/M) << ", "; */
/* cout << y*m/M; */
/* cout << " != "; */
/* cout << (unsigned int) ((float)(y*m)/(float)M); */
/* cout << " != "; */
/* cout << (unsigned int) ((float)y*(float)m/(float)M); */
/* cout << " != "; */
/* cout << (unsigned int) ((float) y*mM); */
/* cout << endl; */
/* } */
uint64_t result_un;
uint64_t p = pow(2, 61)-1;
uint64_t m_un = pow(2, 20)+2;
const clock_t begin_time_un = clock();
for (uint64_t y = 0; y < 100000000; y++) {
/* result_un = multiply_mod_prime_mersenne_overflow(y, m, M, p, m_un); */
result_un = multiply_shift_strongly_universal(y, p, m_un, 20);
}
const clock_t end_time_un = clock();
cout << "Unrelated hash function time: " << float( end_time_un - begin_time_un ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
return EXIT_SUCCESS;
}

175
fagprojekt_code/ex53.cpp Normal file
View File

@ -0,0 +1,175 @@
/* #define NDEBUG */
#include "hash_table.hpp"
#include "hashing_algorithms.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <chrono>
#include <iostream>
#include <bitset>
#include <fstream>
#include <ctype.h>
#include <vector>
#include <assert.h>
using namespace std;
const int max_string_length = 256;
const int D = (max_string_length+7) >> 3; // D = ceil(max_string_length/size(char))
My_string * read_word(const char * book, size_t * reading_progress);
int main() {
const clock_t clock_before_seed = clock();
// load seed generated by random.org
string filename = "random_org_16_bit_numbers.txt";
std::ifstream seed_file(filename);
uint16_t seed_part;
uint16_t seed_parts[D << 2];
size_t count = 0;
while (seed_file >> seed_part) {
seed_parts[count] = seed_part;
count++;
if (count == D << 2)
break; // the seed is large enough for the max_string_length
}
if (count != D << 2) {
cout << "The current seed is not large enough. Please extend it by appending the numbers at https://www.random.org/integers/?num=10000&min=0&max=65535&col=1&base=10&format=plain&rnd=new." << endl;
return EXIT_SUCCESS;
}
// Get 64 bit seed from 16 bit seed
uint64_t seed[D];
memcpy(seed, seed_parts, D*sizeof(seed[0]));
const clock_t clock_before_hash_table = clock();
size_t l = 1; // initially, hash to a table of up to 1024 distinct words (2^10)
Hash_table ht = Hash_table(new Hash_function(seed, multiply_shift_string, l, D));
const clock_t clock_before_loading_book = clock();
// choose a book
/* std::ifstream ifs("genji_monogatari_english.txt"); */
/* std::ifstream ifs("Child_of_Light.txt"); */
/* std::ifstream ifs("the_adventures_of_sherlock_holmes.txt"); */
/* std::ifstream ifs("dracula.txt"); */
std::ifstream ifs("the_complete_works_of_william_shakespeare.txt");
string book_string( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
const char * book = book_string.c_str();
const size_t book_length = book_string.size();
const clock_t clock_before_reading = clock();
std::vector<My_string *> words;
size_t reading_progress = 0;
while (reading_progress < book_length-1) { // book_length includes '\0' which reading_progress avoids
My_string * word = read_word(book, &reading_progress);
if (word->size > 0)
words.push_back(word);
}
const clock_t clock_after_reading = clock();
size_t count_words = 0;
for (My_string * word : words) {
count_words++;
ht.hash(word);
if (ht.is_time_for_rehash()) {
l++; // double the universe which we hashes to
ht.rehash(new Hash_function(seed, multiply_shift_string, l, D));
}
}
const clock_t clock_after_hashing = clock();
Hash_function hf = Hash_function(seed, multiply_shift_string, 10, D);
const clock_t clock_after_init_hash_function = clock();
uint64_t hashed_value = 0;
for (My_string * word : words) {
/* hf.hash(word); */
hashed_value += hf.hash(word);
}
const clock_t clock_after_hashing_only = clock();
cout << "Sum of the hashed values (after overflow): " << hashed_value << endl;
cout << "Nr of words: " << count_words << endl;
cout << "Distinct words: " << ht.get_distict_words() << endl;
cout << "Time: Load seed: " << float( clock_before_hash_table - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash table: " << float( clock_before_loading_book - clock_before_hash_table ) / CLOCKS_PER_SEC << endl;
cout << "Time: Load book: " << float( clock_before_reading - clock_before_loading_book ) / CLOCKS_PER_SEC << endl;
cout << "Time: Read: " << float( clock_after_reading - clock_before_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash & Table: " << float( clock_after_hashing - clock_after_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Total: " << float( clock_after_hashing - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash function: " << float( clock_after_init_hash_function - clock_after_hashing ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash, no table: " << float( clock_after_hashing_only - clock_after_init_hash_function ) / CLOCKS_PER_SEC << endl;
for (My_string * word : words)
delete word;
// BONUS: Polynomium hashing
uint32_t result_un;
size_t book_as_integer_length = (book_length+3)>>2; // ceil(book_length/4)
uint32_t book_as_integer[book_as_integer_length];
book_as_integer[book_as_integer_length-1] = 0;
memcpy(book_as_integer, book, book_length*sizeof(book[0]));
uint32_t a[3] = {12313,2212312,123332};
uint32_t b[3] = {5345,213213,123123};
uint32_t c[3] = {3231,213144,450022};
const clock_t begin_time_pol = clock();
result_un = polynomial_vector(book_as_integer, a, b, c, book_as_integer_length >> 2, 32);
const clock_t end_time_pol = clock();
cout << "polynomium hash function time: " << float( end_time_pol - begin_time_pol ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
const clock_t begin_time_pol_tun = clock();
result_un = polynomial_vector_tuned(book_as_integer, a, b, c, book_as_integer_length >> 2, 32, seed);
const clock_t end_time_pol_tun = clock();
cout << "tuned polynomium hash function time: " << float( end_time_pol_tun - begin_time_pol_tun ) / CLOCKS_PER_SEC << endl;
cout << "Result: " << result_un << endl;
return EXIT_SUCCESS;
}
My_string * read_word(const char * book, size_t * reading_progress) {
bool word_started = false;
char word[max_string_length];
size_t word_length = 0;
char c;
while (book[*reading_progress+1] != '\0') {
(*reading_progress)++;
c = book[*reading_progress];
if (word_started) {
if (isalnum(c)) {
if (word_length != max_string_length) { // crop words longer than the max_string_length
word[word_length] = tolower(c);
word_length++;
}
} else {
return new My_string(word, word_length);
}
} else if (isalnum(c)) {
word_started = true;
word[word_length] = c;
word_length++;
}
}
return new My_string(word, word_length);
}

View File

@ -0,0 +1,202 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdint.h>
void printIntInBinary(unsigned int n);
void intToBinary(unsigned int n, char* p);
void strrev(char* p, unsigned int len);
unsigned int mod32BitMersenne(unsigned int x, unsigned int m);
int main() {
unsigned int q = 2;
unsigned int x = pow(2, 2*q) - 1; // pow(2, 2*q) is the max value for x
unsigned int p = pow(2, q) - 1;
unsigned int y;
y=(x&p)+(x>>q);
if (y>=p) y-=p;
printf("q = %d, x = %d, p = %d, y = %d\n", q, x, p, y);
printf("x&p = %d, x>>q = %d\n", x&p, x>>q);
printIntInBinary(x);
printIntInBinary(p);
printIntInBinary(x&p);
printIntInBinary(x>>q);
printf("\n");
printIntInBinary( ((unsigned int)0b1111101111011101101<<26)>>26 );
printIntInBinary( ((unsigned int)0b1111101111011101101<<27)>>27 );
printIntInBinary( ((unsigned int)0B1111101111011101101<<28)>>28 );
printIntInBinary( ((unsigned int)0B1111101111011101101<<29)>>29 );
printIntInBinary( ((unsigned int)0B1111101111011101101<<30)>>30 );
unsigned int a = 0b0110000000;
unsigned int b = 0b1111111001;
printf("\n");
printIntInBinary(a);
printIntInBinary(b);
b = (b & ~0b110) | ((a & 0b110000000) >> 6);
printIntInBinary(a);
printIntInBinary(b);
/* // Too general an approach */
/* // TODO: Special case where ax+b < p < m */
/* // ((ax+b) mod p) mod m, p=2^89-1, m=2^20 */
/* size_t r = 89; // aka q */
/* size_t bits = 32; */
/* size_t remainingBits = r; */
/* size_t lastIndex = 0; */
/* size_t len = 4; */
/* uint32_t data[len], tmp[len]; */
/* // mod p, p = 2^89-1 */
/* // x&p */
/* for (size_t i = 0; (i+1)*bits < r; i++ ) { */
/* tmp[i] = data[i]&0b1111'1111'1111'1111'1111'1111'1111'1111; // 32 ones. Standard since C++14 */
/* remainingBits -= 32; */
/* lastIndex = i+1; */
/* } */
/* size_t shiftLen = bits-remainingBits; // TODO: Check for 1-off errors */
/* tmp[lastIndex] = (data[lastIndex]<<shiftLen)>>shiftLen; // mod 2^remainingBits */
/* // x>>q */
/* /1* for (size_t i = 0; i < len; i++) { *1/ */
/* /1* } *1/ */
/* // mod m, m = 2^20 */
/* unsigned int mMinusOne = 0b1111'1111'1111'1111'1111; // 20 ones. Standard since C++14 */
/* data[len-1] = data[len-1]&mMinusOne; */
/* // Can be optimized further */
/* size_t len = 4; // input parameter */
/* uint32_t data[len], tmp[len]; */
/* //// mod p, p = 2^89-1 */
/* // x&p */
/* tmp[len] = data[len]&0b1111'1111'1111'1111'1111'1111'1111'1111; // 32 ones. */
/* tmp[len] = data[len]&0b1111'1111'1111'1111'1111'1111'1111'1111; // 32 ones. */
/* tmp[len] = data[len]&0b1'1111'1111'1111'1111'1111'1111; // remaining bits = 89-32*2 = 25 ones. */
/* // x>>q */
/* //// mod m, m = 2^20 */
/* unsigned int mMinusOne = 0b1111'1111'1111'1111'1111; // 20 ones. */
/* data[len-1] = data[len-1]&mMinusOne; */
/* // Optimize more! */
/* size_t len = 4; // input parameter */
/* uint32_t data[len]; // input parameter */
/* uint32_t tmp; */
/* //// mod p, p = 2^89-1 */
/* // x&p */
/* tmp = data[len-1]&0b1111'1111'1111'1111'1111; // 20 ones */
/* // x>>q */
/* // 89 - 2*32 = 25 so */
/* tmp = tmp + (data[len]>>5)&0b1111'1111'1111'1111'1111; // 20 ones */
/* //// mod m, m = 2^20 */
/* tmp = tmp&0b1111'1111'1111'1111'1111; // 20 ones = m - 1 */
/* // clean up before output */
/* for (size_t i = 0; i < len; i++) */
/* data[i] = 0; */
/* // output */
/* data[len-1] = tmp; */
/* // More!!! */
/* size_t len = 4; // input parameter */
/* uint32_t data[len]; // input parameter */
/* //// mod p, p = 2^89-1 */
/* // x&p */
/* data[len-1] = data[len-1]&0b1111'1111'1111'1111'1111; // 20 ones */
/* // x>>q */
/* // 89 - 2*32 = 25 so */
/* data[len-1] = data[len-1] + ((data[len-3]>>5)&0b1111'1111'1111'1111'1111); // 20 ones */
/* //// mod m, m = 2^20 */
/* data[len-1] = data[len-1]&0b1111'1111'1111'1111'1111; // 20 ones = m - 1 */
/* // data[len-1] is the result */
size_t len = 4; // input parameter
uint32_t data[len]; // input parameter
data[len-1] = (data[len-1] + (data[len-3]>>5))&0b1111'1111'1111'1111'1111; // 20 ones
// data[len-1] is the result
// Or as a function
// void mod89mod20 (uint32_t * data, size_t len) {
// data[len-1] = (data[len-1] + (data[len-3]>>5))&0b1111'1111'1111'1111'1111; // 20 ones
// }
return EXIT_SUCCESS;
}
template <typename T>
T mod (T x, T p) {
while (x >= p)
x -= p;
return x;
}
void printIntInBinary(unsigned int n) {
char b[100];
intToBinary(n, b);
printf("%5u = %10s\n", n, b);
}
void intToBinary(unsigned int n, char* p) {
if (NULL == p)
return;
char* q = p;
int count = 0;
do {
if (n % 2 == 0)
*p = '0';
else
*p = '1';
p++;
n = n>>1;
count++;
} while (n > 0);
*p = '\0';
strrev(q, count);
}
void strrev(char* p, unsigned int len) {
char tmp;
int i, j = 0;
i = 0;
j = len - 1;
while (i < j) {
tmp = p[i];
p[i] = p[j];
p[j] = tmp;
i++;
j--;
}
}

View File

@ -0,0 +1,56 @@
#include "hash_table.hpp"
#include "hashed_string.hpp"
#include <stdint.h>
#include <string.h>
Hash_table::Hash_table(Hash_function * hash_function) : hash_function(hash_function) {
m = hash_function->get_m();
words = new Hashed_string[m];
}
Hash_table::~Hash_table() {
delete hash_function;
delete[] words;
}
size_t Hash_table::get_distict_words() {
return distinct_words;
}
void Hash_table::hash(My_string * word) {
uint64_t hash = hash_function->hash(word);
distinct_words += words[hash].append(new Hashed_string(word));
}
void Hash_table::hash(Hashed_string * hs) {
uint64_t hash = hash_function->hash( &(hs->string) );
distinct_words += words[hash].append(hs);
}
void Hash_table::rehash(Hash_function * hash_function) {
delete this->hash_function;
this->hash_function = hash_function;
size_t old_m = m;
m = this->hash_function->get_m();
Hashed_string * old_words = this->words;
words = new Hashed_string[m];
distinct_words = 0; // reset, because we are hashing all the words once more
for (size_t i = 0; i < old_m; i++) {
Hashed_string * hs = &(old_words[i]);
// skip the initializer Hashed_string
while (hs->next != NULL) {
hs = hs->next;
hash(new Hashed_string(hs));
}
}
// delete the old stuff
delete[] old_words;
}

View File

@ -0,0 +1,56 @@
#ifndef HASH_TABLE_H
#define HASH_TABLE_H
#include "hashed_string.hpp"
#include <stdint.h>
struct Hash_function {
private:
uint64_t * seed;
uint64_t(* hash_function)(const My_string *, const uint64_t *, uint64_t *, size_t);
size_t l;
uint64_t * x;
public:
Hash_function(uint64_t * seed, uint64_t(* hash_function)(const My_string *, const uint64_t *, uint64_t *, size_t), size_t l, size_t D) {
this->seed = seed;
this->hash_function = hash_function;
this->l = l;
this->x = new uint64_t[D];
}
~Hash_function() {
delete[] x;
}
size_t get_m() {
return 1 << l;
}
uint64_t hash(const My_string * string) {
return hash_function(string, seed, x, l);
}
};
struct Hash_table {
private:
Hash_function * hash_function;
Hashed_string * words;
size_t distinct_words = 0;
size_t m;
public:
Hash_table(Hash_function * hash_function);
~Hash_table();
size_t get_distict_words();
void hash(My_string * word);
void hash(Hashed_string * hs);
void rehash(Hash_function * hash_function);
int is_time_for_rehash() {return distinct_words >= m;}
};
#endif

View File

@ -0,0 +1,78 @@
#include "hashed_string.hpp"
#include <stdlib.h>
#include <stdint.h>
#include <string>
#include <string.h>
#include <iostream>
My_string::My_string(char * chars, size_t size) {
this->size = size;
this->chars = new char[size];
if (size != 0) {
memcpy(this->chars, chars, size*sizeof(this->chars[0]));
}
}
My_string::~My_string() {
delete[] chars;
}
bool My_string::operator==(const My_string& other)
{
if (size != other.size)
return false;
return strncmp(chars, other.chars, size) == 0;
}
Hashed_string::Hashed_string() : string(My_string(NULL, 0)) {
initializer = true;
}
Hashed_string::Hashed_string(My_string * s) : string(My_string(s->chars, s->size)) {};
Hashed_string::Hashed_string(Hashed_string * hs) : string(My_string(hs->string.chars, hs->string.size)), appearances(hs->appearances) {};
Hashed_string::~Hashed_string() {
// if you delete the initializer, delete the whole chain
if (initializer) {
while (next != NULL)
next->remove(next);
}
}
int Hashed_string::append(Hashed_string * new_string) {
if (!initializer) {
if (string == new_string->string) {
appearances += new_string->appearances;
return 0;
}
}
if (last_element) {
new_string->previous = this;
next = new_string;
last_element = false;
return 1;
} else {
return next->append(new_string);
}
}
void Hashed_string::remove(Hashed_string * removed_string) {
if (initializer && last_element)
return;
if (string == removed_string->string) {
if (last_element) {
previous->last_element = true;
previous->next = NULL;
} else {
previous->next = next;
next->previous = previous;
}
delete this;
} else if (!last_element)
next->remove(removed_string);
}

View File

@ -0,0 +1,34 @@
#ifndef HASHED_OBJECT_H
#define HASHED_OBJECT_H
#include <string>
struct My_string {
char * chars;
size_t size;
My_string(char *, size_t);
~My_string();
bool operator==(const My_string& other);
};
struct Hashed_string {
Hashed_string * next = NULL;
Hashed_string * previous = NULL;
bool last_element = true;
bool initializer = false; // signal the object to act as the "link" between the its reference and the linked list of hashed elemenets. This instance DOES NOT represent a hashed value.
My_string string;
unsigned int appearances = 1;
Hashed_string();
Hashed_string(My_string * s);
Hashed_string(Hashed_string * hs);
~Hashed_string();
int append(Hashed_string * new_string); // returns 1 if the word was new and 0 otherwise.
void remove(Hashed_string * removed_string);
};
#endif

View File

@ -0,0 +1,294 @@
#include "hashing_algorithms.hpp"
#include "hashed_string.hpp"
#include <string.h>
// p is prime. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m) {
return ((a*x+b) % p) % m;
}
// p is a Mersenne prime. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime_mersenne(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m) {
uint64_t y = a*x+b;
y = (y&p)+(y>>p);
if (y>=p) y-=p;
return y % m;
}
// p is a Mersenne prime. m=2^q. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime_mersenne_overflow(uint64_t x, uint64_t a, uint64_t b, uint64_t p, char q) {
uint64_t y = a*x+b;
y = (y&p)+(y>>p);
if (y>=p) y-=p;
return y & ~( ( ~(uint64_t)0 ) << q); // OBS: Behaviour undefined for shifting n-bit integers n times
}
// p is a Mersenne prime. m=2^q. p > m > 1, p > a > 0
uint64_t multiply_mod_prime_mersenne_overflow_no_b(uint64_t x, uint64_t a, uint64_t p, char q) {
uint64_t y = a*x;
y = (y&p)+(y>>p);
if (y>=p) y-=p;
return y & ~( ( ~(uint64_t)0 ) << q); // OBS: Behaviour undefined for shifting n-bit integers n times
}
// p=2^89-1 is a Mersenne prime. m=2^l. 32 >= l > 0. p > a > 0, p > b >= 0. x is an 64 bit integer. We assume x, a, and b are arrays of 32 bit integers.
uint32_t multiply_mod_prime_mersenne_overflow_high_bitcount(uint32_t * x, uint32_t * a, uint32_t * b, char l) {
// x is a 64 bit integer given as a 2 long array of 32 bit integers
// a is a 89 bit integer given as a 3 long array of 32 bit integers
// b is a 89 bit integer given as a 3 long array of 32 bit integers
// ax: array to hold the sub-calculations of a*x
// there are 2*3 sub-calculations with each result being split into the least significant 32 bits and the most significant 32 bits
uint32_t ax[12]; // 12 = 2*3*2
for (size_t i = 0; i < 2; i++) { // index x
for (size_t ii = 0; ii < 3; ii++) { // index a
uint64_t tmp = (uint64_t)x[i] * (uint64_t)a[ii];
ax[6*i+2*ii] = (uint32_t)(tmp & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
ax[6*i+2*ii+1] = (uint32_t)(tmp >> 32);
}
}
// calculate y = ax+b
uint32_t * y = new uint32_t[5];
uint64_t sum = 0;
uint32_t carry = 0;
// calculate bits 0-32
sum = (uint64_t)ax[10]+(uint64_t)b[2];
carry = (uint32_t)(sum >> 32);
y[4] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// calculate bits 33-64
sum = (uint64_t)carry + (uint64_t)ax[11] + (uint64_t)ax[8] + (uint64_t)ax[4] + (uint64_t)b[1];
carry = (uint32_t)(sum >> 32);
y[3] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// calculate bits 65-96
sum = (uint64_t)carry + (uint64_t)ax[9] + (uint64_t)ax[5] + (uint64_t)ax[2] + (uint64_t)ax[6] + (uint64_t)b[0];
carry = (uint32_t)(sum >> 32);
y[2] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// calculate bits 97-128
sum = (uint64_t)carry + (uint64_t)ax[3] + (uint64_t)ax[7] + (uint64_t)ax[0];
carry = (uint32_t)(sum >> 32);
y[1] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// calculate bits 129-160
y[0] = ax[1];
//// calculate modulo p for p=2^89-1
// y&p
// we take the 89 first bits
uint32_t yandp[3];
yandp[2] = y[4];
yandp[1] = y[3];
yandp[0] = (y[2] & 0b11111'11111'11111'11111'11111); // 25 ones
// y>>q
// we bitshift 89 times, so we only keep the 5*32 - 89 = 71 most significant bits
uint32_t yshiftq[3];
yshiftq[2] = (y[2] >> 25) // keep 7 bits
| (y[1] << 7); // keep 25 bits
yshiftq[1] = (y[1] >> 25) | (y[0] << 7);
yshiftq[0] = y[0] >> 25;
// y = (y&p) + (y >> q)
// bits 0-32
sum = (uint64_t)yandp[2] + (uint64_t)yshiftq[2];
carry = (uint32_t)(sum >> 32);
y[4] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// bits 33-64
sum = (uint64_t)carry + (uint64_t)yandp[1] + (uint64_t)yshiftq[1];
carry = (uint32_t)(sum >> 32);
y[3] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// bits 65-71
y[2] = (uint64_t)carry + (uint64_t)yandp[0] + (uint64_t)yshiftq[0];
y[1] = 0;
// y[0] = 0;
//// if y >= p; y -= p
// y >= p if bits 90 to 96 are != 0 (actually if bit 90 is set, but this is prettier)
if ((y[2] >> 25) != 0) {
// subtracting 2^89-1 is equal to subtracting 2^89 and adding 1.
// - 2^89
y[2] = y[2] & 0b1'1111'1111'1111'1111'1111'1111; // 25 ones. We know that bit 91 to 96 are 0
// + 1
for (size_t i = 4; i > 1; i--) {
y[i] += 1;
if (y[i] != 0)
break;
}
}
// mod 2^l (mod m)
return (y[4] >> (32-l));
}
uint64_t multiply_shift_c_universal(uint32_t x, uint64_t a, char l) {
return (a*x) >> (64-l);
}
uint64_t multiply_shift_strongly_universal(uint32_t x, uint64_t a, uint64_t b, char l) {
return (a*x+b) >> (64-l);
}
uint64_t multiply_shift_vector(uint32_t * x, uint64_t * seed, size_t d, char l) {
uint64_t val = 0;
for (size_t i = 0; i < d; i++)
val += seed[i]*x[i];
return (val + seed[d-1]) >> (64-l);
}
// requires x to be of size D and the seed to be of size D
uint64_t multiply_shift_string(const My_string * string, const uint64_t * seed, uint64_t * x, size_t l) {
size_t d = (string->size+7) >> 3; // d = ceil(string->size/8)
x[d-1] = 0;
memcpy(x, string->chars, string->size*sizeof(string->chars[0]));
uint64_t val = 0;
for (size_t i = 0; i < d; i++)
val += (seed[2*i]+(uint32_t)(x[i]>>32))*(seed[2*i+1]+(uint32_t)x[i]);
return (val + seed[d]) >> (64-l);
}
// calculate (ax+b) mod p
// p=2^89-1 is a Mersenne prime. p > a. p > x. a and x are size 3 arrays of uint32_t.
// b is a 64 bit integer given as a size 2 array of uint32_t.
// The result is saved in x.
void high_bitcount_ax_b_mod_p(uint32_t * x, uint32_t * a, uint32_t * b) {
// x, a, and b are 89 bit integers given as a 3 long arrays of 32 bit integers.
// ax: array to hold the sub-calculations of a*x
// there are 3*3 sub-calculations with each result being split into the least significant 32 bits and the most significant 32 bits
size_t ax_size = 18; // 18 = 3*3*2
uint32_t ax[ax_size];
for (size_t i = 0; i < 3; i++) { // index x
for (size_t ii = 0; ii < 3; ii++) { // index a
uint64_t tmp = (uint64_t)x[i] * (uint64_t)a[ii];
ax[6*i+2*ii] = (uint32_t)(tmp & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
ax[6*i+2*ii+1] = (uint32_t)(tmp >> 32);
}
}
// calculate y = ax+b
size_t y_size = 6; // 6 = ceil( (89+89+1)/32 )
uint32_t * y = new uint32_t[y_size];
uint64_t sum = 0;
uint32_t carry = 0;
y[0] = ax[0]+b[0];
for (size_t index = 1; index < y_size-1; index++) {
sum = 0;
for (size_t i = 0; i <= index; i++) {
size_t ii = index - i;
sum += ax[6*i+2*ii];
}
for (size_t i = 0; i < index; i++) {
size_t ii = index - i - 1;
sum += ax[6*i+2*ii+1];
}
if (index < 2) {
sum += b[index];
}
carry = (uint32_t)(sum >> 32);
y[index] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
}
y[y_size-1] = carry+ax[ax_size-1];
//// calculate modulo p for p=2^89-1
// y&p
// we take the 89 first bits
uint32_t yandp[3];
yandp[0] = y[0];
yandp[1] = y[1];
yandp[2] = (y[2] & 0b11111'11111'11111'11111'11111); // 25 ones
// y>>q
// we bitshift 89 times, and keep the 89 following bits
uint32_t yshiftq[3];
yshiftq[0] = (y[2] >> 25) // keep 7 bits
| (y[3] << 7); // keep 25 bits
yshiftq[1] = (y[3] >> 25) | (y[4] << 7);
yshiftq[2] = (y[4] >> 25) // keep 7 bits
| (y[5] << 7); // keep 25 bits, but only 18 of them can be nonzero in practice.
// y = (y&p) + (y >> q)
// bits 0-32
sum = (uint64_t)yandp[0] + (uint64_t)yshiftq[0];
carry = (uint32_t)(sum >> 32);
y[0] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// bits 33-64
sum = (uint64_t)carry + (uint64_t)yandp[1] + (uint64_t)yshiftq[1];
carry = (uint32_t)(sum >> 32);
y[1] = (uint32_t)(sum & 0b1111'1111'1111'1111'1111'1111'1111'1111); // 32 ones
// bits 65-71
y[2] = (uint64_t)carry + (uint64_t)yandp[2] + (uint64_t)yshiftq[2];
// y[3] = 0; // we don't use it anyway
// y[4] = 0;
// y[5] = 0;
//// if y >= p; y -= p
// y >= p if bits 90 to 96 are != 0 (actually if bit 90 is set, but this is prettier)
if ((y[2] << 25) != 0) {
// subtracting 2^89-1 is equal to subtracting 2^89 and adding 1.
// - 2^89
y[2] = y[2] & 0b1'1111'1111'1111'1111'1111'1111; // 25 ones. We know that bit 91 to 96 are 0
// + 1
for (size_t i = 0; i < 3; i++) {
y[i] += 1;
if (y[i] != 0)
break;
}
}
x[0] = y[0];
x[1] = y[1];
x[2] = y[2];
return;
}
// p=2^89-1 is a Mersenne prime.
// p > a >= 0. p > b >= 0. p > c >= 0. x is a size 2d list of 64 bit integers split into 32 bit integers.
// We assume a, b, and c are size 3 arrays of 32 bit integers.
uint32_t polynomial_vector(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l) {
uint32_t H[3];
H[0] = x[0];
H[1] = x[1];
for (size_t i = 1; i < d; i++) {
high_bitcount_ax_b_mod_p(H, c, x+2*i);
}
high_bitcount_ax_b_mod_p(H, a, b);
return H[0] >> (32-l);
}
// p=2^89-1 is a Mersenne prime.
// p > a >= 0. p > b >= 0. p > c >= 0. x is a size 2d list of 64 bit integers split into 32 bit integers.
// We assume a, b, and c are size 3 arrays of 32 bit integers.
// we assume the seed to be of size 4 (at minimum).
uint32_t polynomial_vector_tuned(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l, const uint64_t * seed) {
size_t x_remainder = d - ((d >> 2) << 2); // abuse integer division and multiplication (via bitshifts) as modulo
size_t x_tuned_size = (d >> 2) + 1;
uint32_t x_tuned[x_tuned_size];
// prehash chunks of x using the bounded string algorithm
uint64_t buffer_memory[4];
char word[256];
for (size_t i = 0; i <= (d >> 4) - 1; i++ ) { // d 64 bit integers -> d/4 256 char strings
memcpy(word, x + i*4, 256*sizeof(word[0]));
My_string str = My_string(word, 256);
x_tuned[i] = (uint32_t) multiply_shift_string(&str, seed, buffer_memory, 32);
}
// prehash the leftovers
if (x_remainder != 0) {
memcpy(word, x + d-x_remainder, (x_remainder << 3)*sizeof(word[0]));
My_string str = My_string(word, (x_remainder << 3));
x_tuned[x_tuned_size-1] = (uint32_t) multiply_shift_string(&str, seed, buffer_memory, 32);
}
return polynomial_vector(x_tuned, a, b, c, x_tuned_size, l);
}

View File

@ -0,0 +1,42 @@
#ifndef HASHING_ALGORITHMS_H
#define HASHING_ALGORITHMS_H
#include "hashed_string.hpp"
#include <stdint.h>
// p is prime. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m);
// p is a Mersenne prime. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime_mersenne(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m);
// p is a Mersenne prime. m=2^q. p > m > 1, p > a > 0, p > b >= 0
uint64_t multiply_mod_prime_mersenne_overflow(uint64_t x, uint64_t a, uint64_t b, uint64_t p, char q);
// p is a Mersenne prime. m=2^q. p > m > 1, p > a > 0
uint64_t multiply_mod_prime_mersenne_overflow_no_b(uint64_t x, uint64_t a, uint64_t p, char q);
// p=2^89-1 is a Mersenne prime. m=2^l. 32 >= l > 0. p > a > 0, p > b >= 0. x is an 64 bit integer. We assume x, a, and b are arrays of 32 bit integers.
uint32_t multiply_mod_prime_mersenne_overflow_high_bitcount(uint32_t * x, uint32_t * a, uint32_t * b, char l);
uint64_t multiply_shift_c_universal(uint32_t x, uint64_t a, char l);
uint64_t multiply_shift_strongly_universal(uint32_t x, uint64_t a, uint64_t b, char l);
uint64_t multiply_shift_vector(uint32_t * x, uint64_t * seed, size_t d, char l);
// requires x to be of size D and the seed to be of size D
uint64_t multiply_shift_string(const My_string * string, const uint64_t * seed, uint64_t * x, size_t l);
// p=2^89-1 is a Mersenne prime.
// p > a >= 0. p > b >= 0. p > c >= 0. x is a size 2d list of 64 bit integers split into 32 bit integers.
// We assume a, b, and c are size 3 arrays of 32 bit integers.
uint32_t polynomial_vector(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l);
// p=2^89-1 is a Mersenne prime.
// p > a >= 0. p > b >= 0. p > c >= 0. x is a size 2d list of 64 bit integers split into 32 bit integers.
// We assume a, b, and c are size 3 arrays of 32 bit integers.
// we assume the seed to be of size 4 (at minimum).
uint32_t polynomial_vector_tuned(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l, const uint64_t * seed);
#endif

View File

@ -0,0 +1,152 @@
/* #define NDEBUG */
#include "hash_table.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <chrono>
#include <iostream>
#include <bitset>
#include <fstream>
#include <ctype.h>
#include <vector>
#include <assert.h>
using namespace std;
const int max_string_length = 256;
const int D = (max_string_length+7) >> 3; // D = ceil(max_string_length/size(char))
My_string * read_word(const char * book, size_t * reading_progress);
int main() {
return EXIT_SUCCESS;
}
void speedtest_using_hash_table(string title, uint64_t(* algorithm)(const My_string *, const uint64_t *, uint64_t *, size_t), string seed_file) {
const clock_t clock_before_seed = clock();
// load seed generated by random.org
string filename = "random_org_16_bit_numbers.txt";
std::ifstream seed_file(filename);
uint16_t seed_part;
uint16_t seed_parts[D << 2];
size_t count = 0;
while (seed_file >> seed_part) {
seed_parts[count] = seed_part;
count++;
if (count == D << 2)
break; // the seed is large enough for the max_string_length
}
if (count != D << 2) {
cout << "The current seed is not large enough. Please extend it by appending the numbers at https://www.random.org/integers/?num=10000&min=0&max=65535&col=1&base=10&format=plain&rnd=new." << endl;
return EXIT_SUCCESS;
}
// Get 64 bit seed from 16 bit seed
uint64_t seed[D];
memcpy(seed, seed_parts, D*sizeof(seed[0]));
const clock_t clock_before_hash_table = clock();
size_t l = 1; // initially, hash to a table of up to 1024 distinct words (2^10)
Hash_table ht = Hash_table(new Hash_function(seed, hash_string, l, D));
const clock_t clock_before_loading_book = clock();
// choose a book
/* std::ifstream ifs("genji_monogatari_english.txt"); */
/* std::ifstream ifs("Child_of_Light.txt"); */
/* std::ifstream ifs("the_adventures_of_sherlock_holmes.txt"); */
/* std::ifstream ifs("dracula.txt"); */
std::ifstream ifs("the_complete_works_of_william_shakespeare.txt");
string book_string( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
const char * book = book_string.c_str();
const size_t book_length = book_string.size();
const clock_t clock_before_reading = clock();
std::vector<My_string *> words;
size_t reading_progress = 0;
while (reading_progress < book_length-1) { // book_length includes '\0' which reading_progress avoids
My_string * word = read_word(book, &reading_progress);
if (word->size > 0)
words.push_back(word);
}
const clock_t clock_after_reading = clock();
size_t count_words = 0;
for (My_string * word : words) {
count_words++;
ht.hash(word);
if (ht.is_time_for_rehash()) {
l++; // double the universe which we hashes to
ht.rehash(new Hash_function(seed, hash_string, l, D));
}
}
const clock_t clock_after_hashing = clock();
Hash_function hf = Hash_function(seed, hash_string, 10, D);
const clock_t clock_after_init_hash_function = clock();
uint64_t hashed_value = 0;
for (My_string * word : words) {
/* hf.hash(word); */
hashed_value += hf.hash(word);
}
const clock_t clock_after_hashing_only = clock();
cout << "Sum of the hashed values (after overflow): " << hashed_value << endl;
cout << "Nr of words: " << count_words << endl;
cout << "Distinct words: " << ht.get_distict_words() << endl;
cout << "Time: Load seed: " << float( clock_before_hash_table - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash table: " << float( clock_before_loading_book - clock_before_hash_table ) / CLOCKS_PER_SEC << endl;
cout << "Time: Load book: " << float( clock_before_reading - clock_before_loading_book ) / CLOCKS_PER_SEC << endl;
cout << "Time: Read: " << float( clock_after_reading - clock_before_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash & Table: " << float( clock_after_hashing - clock_after_reading ) / CLOCKS_PER_SEC << endl;
cout << "Time: Total: " << float( clock_after_hashing - clock_before_seed ) / CLOCKS_PER_SEC << endl;
cout << "Time: Init hash function: " << float( clock_after_init_hash_function - clock_after_hashing ) / CLOCKS_PER_SEC << endl;
cout << "Time: Hash, no table: " << float( clock_after_hashing_only - clock_after_init_hash_function ) / CLOCKS_PER_SEC << endl;
for (My_string * word : words)
delete word;
}
My_string * read_word(const char * book, size_t * reading_progress) {
bool word_started = false;
char word[max_string_length];
size_t word_length = 0;
char c;
while (book[*reading_progress+1] != '\0') {
(*reading_progress)++;
c = book[*reading_progress];
if (word_started) {
if (isalnum(c)) {
if (word_length != max_string_length) { // crop words longer than the max_string_length
word[word_length] = tolower(c);
word_length++;
}
} else {
return new My_string(word, word_length);
}
} else if (isalnum(c)) {
word_started = true;
word[word_length] = c;
word_length++;
}
}
return new My_string(word, word_length);
}

245
general_library.cpp Normal file
View File

@ -0,0 +1,245 @@
#include "general_library.hpp"
// https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm#Modular_integers
uint32_t multiplicative_inverse(int a, int n) {
int t = 0, newt = 1, r = n, newr = a;
while (newr != 0) {
int quotient = r/newr;
int tmp = newt;
newt = t-quotient*newt;
t = tmp;
tmp = newr;
newr = r-quotient*newr;
r = tmp;
}
if (r > 1)
throw std::runtime_error("Tried to inverse a noninvertible element!");
if (t < 0)
t = t + n;
return t;
}
// https://stackoverflow.com/questions/18620942/find-the-smallest-period-of-input-string-in-on
std::vector<int> calculateLPS(const char * pat, int m) {
/* int[] lps = new int[pat.length()]; */
int len = 0;
int i = 1;
std::vector<int> lps = {0};
lps.resize(m);
while (i < m) {
if (pat[i] == pat[len]) {
len++;
lps[i] = len;
i++;
}
else {
if (len != 0) {
len = lps[len - 1];
}
else {
lps[i] = len;
i++;
}
}
}
return lps;
}
// calculates the length of the shortest period
int len_of_shortest_period (const char * pattern, int m) {
std::vector<int> lps = calculateLPS(pattern, m);
//start at the end of the string
int i = lps.size()-1;
while (lps[i] != 0) {
//shift back
i -= lps[i];
}
return i+1;
}
// SECTION: Functions related to calculating with polynomials in Z2[x]
uint32_t irreducible_polynomials[] {
0b11, // irreducible polynomial of degree 1
0b111, // irreducible polynomial of degree 2
0b1011, // irreducible polynomial of degree 3
0b10011, // irreducible polynomial of degree 4
0b100101, // irreducible polynomial of degree 5
0b1011011, // irreducible polynomial of degree 6
0b10000011, // irreducible polynomial of degree 7
0b100011101, // irreducible polynomial of degree 8
0b1000010001, // irreducible polynomial of degree 9
0b10001101111, // irreducible polynomial of degree 10
0b100000000101, // irreducible polynomial of degree 11
0b1000011101011, // irreducible polynomial of degree 12
0b10000000011011, // irreducible polynomial of degree 13
0b100000010101001, // irreducible polynomial of degree 14
0b1000000000110101, // irreducible polynomial of degree 15
0b10000000000101101, // irreducible polynomial of degree 16
0b100000000000001001, // irreducible polynomial of degree 17
0b1000001010000000011, // irreducible polynomial of degree 18
0b10000000000000100111, // irreducible polynomial of degree 19
0b100000000011011110011, // irreducible polynomial of degree 20
0b1000000000000001100101, // irreducible polynomial of degree 21
0b10000000001111101100001, // irreducible polynomial of degree 22
0b100000000000000000100001, // irreducible polynomial of degree 23
0b1000000011110011010101001, // irreducible polynomial of degree 24
0b10000000000000000101000101, // irreducible polynomial of degree 25
0b100000000000100010111010011, // irreducible polynomial of degree 26
0b1000000000000001011010101101, // irreducible polynomial of degree 27
0b10000000000000010000011100101, // irreducible polynomial of degree 28
0b100000000000000000000000000101, // irreducible polynomial of degree 29
0b1000000000000110010100010101111, // irreducible polynomial of degree 30
0b10000000000000000000000000001001 // irreducible polynomial of degree 31
};
size_t degree (uint32_t polynomial) {
return 31 - __builtin_clz(polynomial);
}
uint32_t multiply_modulo_polynomials_in_Z2 (uint32_t q1, uint32_t q2, uint32_t p) {
size_t p_degree = degree(p);
size_t q1_degree = degree(q1);
size_t q2_degree = degree(q2);
if (q1_degree > p_degree)
throw std::logic_error("We only support multiply-modulo whenever the multiplied polynomials are initially at most the same degree as the modulo polynomial.");
if (q1_degree == p_degree)
q1 ^= p;
uint32_t arr[q2_degree+1];
arr[0] = q1;
for (size_t i = 1; i <= q2_degree; i++) {
arr[i] = arr[i-1]<<1;
if (degree(arr[i]) == p_degree)
arr[i] ^= p;
}
uint32_t sum = 0;
std::bitset<32> b(q2);
for (size_t i = 0; i <= q2_degree; i++) {
if (b[i] == 1)
sum ^= arr[i];
}
return sum;
}
uint32_t polynomial_power_in_Z2(uint32_t q, uint32_t exp, uint32_t p) {
if (exp == 0)
return 1;
if (degree(q) == degree(p))
q ^= p;
size_t log2exp = degree(exp) + 1;
uint32_t arr[log2exp];
arr[0] = q;
for (size_t i = 1; i < log2exp; i++)
arr[i] = multiply_modulo_polynomials_in_Z2(arr[i-1], arr[i-1], p);
uint32_t res = 1;
std::bitset<32> b(exp);
for (size_t i = 0; i <= log2exp; i++) {
if (b[i] == 1)
res = multiply_modulo_polynomials_in_Z2(res, arr[i], p);
}
return res;
}
void rref_Z2(std::vector<std::bitset<32>> &A, size_t m, size_t n) {
if (m > 31 || n > 32)
throw std::logic_error("We only support rref_Z2 for up to a (31x32)-matrix.");
// calculate row echelon form
// https://en.wikipedia.org/wiki/Gaussian_elimination#Pseudocode
size_t h = 0; // Initialization of the pivot row
size_t k = 0; // Initialization of the pivot column
while (h < m && k < n) {
// Find the k-th pivot:
int i_max = -1;
for (size_t i = h; i < m; i++) {
if (A[i][k]) {
i_max = i;
break;
}
}
if (i_max == -1) {
// No pivot in this column, pass to next column
k++;
} else {
auto tmp = A[i_max];
A[i_max] = A[h];
A[h] = tmp;
// Do for all rows below pivot:
for (size_t i = h+1; i < m; i++){
bool f = A[i][k];
if (f == 0)
continue;
// Fill with zeros the lower part of pivot column:
A[i][k] = 0;
// Do for all remaining elements in current row:
for (size_t j = k+1; j<n; j++) {
A[i][j] = A[i][j] ^ A[h][j];
}
}
// Increase pivot row and column
h++;
k++;
}
}
// Perform back substitution
for (size_t i = 1; i < m; i++) {
for (size_t j = 0; j < i; j++) {
if (A[j][i] == 1) {
for (size_t l = i; l < n; l++) // check this
A[j][l] = A[j][l] ^ A[i][l];
}
}
}
}
uint32_t get_random_irreducible_polynomial_in_Z2 (size_t k) {
if (k > 31)
throw std::logic_error("We only support polynomials of degree 31 or less.");
uint32_t init_p = irreducible_polynomials[k-1];
srand(time(0));
uint32_t gamma = (rand() % (((uint32_t)1 << (k+1)) - 2)) + 2; // gamma in GF(2^k)\GF(2)
uint32_t gamma_pow[k+1];
for (size_t i = 0; i <= k; i++)
gamma_pow[i] = polynomial_power_in_Z2(gamma, i, init_p);
// Create an array of the polynomials in gamma_pow, and transpose it at the same time
std::vector<std::bitset<32>> A(k); // NOTE: That 32 > 31 >= k
for (size_t i = 0; i < k; i++) {
for (size_t j = 0; j <= k; j++) {
A[i][k-j] = gamma_pow[j] & ((uint32_t)1<<i);
}
}
rref_Z2(A, k, k+1);
std::bitset<32> bp;
for (size_t i = 0; i < k; i++) {
bp[i] = A[i][k];
}
bp[k] = 1;
uint32_t p = (uint32_t)bp.to_ulong();
return p;
}

17
general_library.hpp Normal file
View File

@ -0,0 +1,17 @@
#ifndef GENERAL_LIBRARY_H
#define GENERAL_LIBRARY_H
#include <stdlib.h>
#include <stdint.h>
#include <stdexcept>
#include <vector>
#include <bitset>
#include <iostream>
uint32_t multiplicative_inverse(int a, int n);
int len_of_shortest_period(const char * pattern, int m);
// SECTION: Functions related to calculating with polynomials in Z2[x]
uint32_t get_random_irreducible_polynomial_in_Z2 (size_t k);
#endif

View File

@ -0,0 +1,12 @@
F = GF(2)
PR = PolynomialRing(F, 'x')
max_k = 31
whitespace = max_k+1
for i in range(1, max_k+1):
poly_int = '0b'+ ''.join(map(str, PR.irreducible_element(i).coefficients(sparse=False)[::-1]))
poly_int += ','
poly_int += ' '*(max_k-i)
print(f' {poly_int}// irreducible polynomial of degree {i}')

View File

@ -0,0 +1,96 @@
k = 31
F = GF(2)
E = GF(2^k)
PR = PolynomialRing(F, 'x')
p = PR.irreducible_element(k)
while(True):
gamma = E.random_element()
if gamma not in F:
break
# gamma = PR('x^3+x')
# print('0b'+ ''.join(map(str, gamma.coefficients(sparse=False)[::-1])))
eqs = [PR(gamma)^i % p for i in range(k+1)]
for eq in eqs:
print(eq)
print()
eqs = list(map(lambda eq: eq.coefficients(sparse=False) + [0]*(k-eq.degree()-1), eqs))
for eq in eqs:
print(eq)
print()
eqs = matrix(eqs[::-1])
print(eqs)
print()
eqs = eqs.transpose()
print(eqs)
print()
rref = eqs.rref()
print(rref)
irrpol = PR(rref[:,-1].list() + [1])
print(irrpol)
print(irrpol.is_irreducible())
print()
# manual rref
def calc_rref_Z2(A, m, n):
"""Calculate the reduced row echelon form of a (mxn)-matrix"""
A = copy(A)
# calculate row echelon form
# https://en.wikipedia.org/wiki/Gaussian_elimination#Pseudocode
h = 0 # Initialization of the pivot row
k = 0 # Initialization of the pivot column
while h < m and k < n:
# Find the k-th pivot:
i_max = -1
for i in range(h, m):
if A[i,k] == 1:
i_max = i
break
if i_max == -1:
# No pivot in this column, pass to next column
k += 1
else:
tmp = A[i_max, :]
A[i_max, :] = A[h, :]
A[h, :] = tmp
# Do for all rows below pivot:
for i in range(h+1, m):
f = A[i][k];
if f == 0:
continue;
A[i, k] = 0;
for j in range(k+1, n):
A[i, j] = A[i, j] - A[h, j]
# Increase pivot row and column
h += 1
k += 1
# perform back substitution
for i in range(1, m):
for j in range(i):
if A[j, i] == 1:
A[j, :] -= A[i, :]
return A
print('Manual RREF:\n')
rref = calc_rref_Z2(eqs, k, k+1) # cols = rows+1, since we transposed the matrix
print(rref)
print()
irrpol = PR(rref[:,-1].list() + [1])
print(irrpol)
print(irrpol.is_irreducible())
print()
print('0b'+ ''.join(map(str, irrpol.coefficients(sparse=False)[::-1])))

106
hash_function_library.cpp Normal file
View File

@ -0,0 +1,106 @@
#include "hash_function_library.hpp"
int Rolling_hash::get_fingerprint() {
return (int)fingerprint;
}
Rabin_karp::Rabin_karp(uint32_t p, size_t length) : p(p) {
for (size_t i = 0; i < length; i++)
elements.push(0);
if (p >= (uint32_t)(1<<31))
throw std::overflow_error("A Rabin-Karp hash function has been initialized with too large a prime, such that we will encounter overflow errors.");
xi = (2 << length) % p;
}
void Rabin_karp::slide(unsigned char c_in) {
unsigned char c_out = elements.front();
elements.pop();
elements.push(c_in);
for (size_t i = 7; i >= 0; i--) {
bool bit_in = (c_in & (1 << i)) != 0;
bool bit_out = (c_out & (1 << i)) != 0;
slide_bit(bit_in, bit_out);
}
}
void Rabin_karp::slide_bit(bool bit_in, bool bit_out) {
fingerprint = ((fingerprint << 1) - xi*bit_out + bit_in);
// fast mod p
if (fingerprint > (uint32_t)p)
fingerprint -= p;
}
Polynomial_fingerprint::Polynomial_fingerprint(int32_t p, int32_t r) : p(p), r(r) {}
void Polynomial_fingerprint::push(unsigned char c) {
int exp = elements.size() % (p-1);
fingerprint = (fingerprint + c*(unsigned int)pow(r, exp)) % p;
elements.push(c);
}
void Polynomial_fingerprint::shift(size_t i) {
int32_t subtract_fingerprint = 0;
for (size_t ii = 0; ii < i; ii++) {
unsigned char c = elements.front();
elements.pop();
int exp = (ii+1) % (p-1);
subtract_fingerprint += c*pow(r, exp);
}
fingerprint = (fingerprint-subtract_fingerprint)*multiplicative_inverse(pow(r, i), p) % p;
}
Porat_porat_polynomial_fingerprint::Porat_porat_polynomial_fingerprint(std::string P, int32_t p, int32_t r) : p(p), r(r) {
// Calculate the shortest periods for all prefixes of length 2^i (and the full pattern)
{
size_t i = 1;
const char * P_c_str = P.c_str();
while (i < P.length()) {
shortest_periods.push_back(len_of_shortest_period(P_c_str, i));
i <<= 1;
}
shortest_periods.push_back(len_of_shortest_period(P_c_str, P.length()));
}
// Calculate the fingerprints of all prefixes of length 2^i, and of the shortest periods found before
{
size_t next_2_exponent = 1;
size_t shortest_period_index = 0;
for (size_t i = 0; i < P.length(); i++) {
if (i == next_2_exponent) {
if (!prehashed_indices.contains(i)) {
prehashed_values.push_back(get_fingerprint());
prehashed_indices[i] = prehashed_values.size()-1;
}
}
else if (shortest_period_index < shortest_periods.size() && i == shortest_periods[shortest_period_index]) {
if (!prehashed_indices.contains(i)) {
prehashed_values.push_back(get_fingerprint());
prehashed_indices[i] = prehashed_values.size()-1;
}
shortest_period_index++;
}
else {
push(P[i]);
}
}
// We have now pushed the entire pattern
prehashed_values.push_back(get_fingerprint());
prehashed_indices[P.length()] = prehashed_values.size()-1;
}
// Save the length of the pattern before we throw the pattern away
m = P.length();
}
void Porat_porat_polynomial_fingerprint::push(unsigned char c){
// TODO
}
void Porat_porat_polynomial_fingerprint::shift(size_t i){
// TODO
}

108
hash_function_library.hpp Normal file
View File

@ -0,0 +1,108 @@
#ifndef HASH_LIBRARY_H
#define HASH_LIBRARY_H
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <string>
#include <queue>
#include <stdexcept>
#include <map>
#include "general_library.hpp"
class Rolling_hash {
public:
// pushes element
virtual void push(unsigned char c);
// shifts i indices
// the hash function should already know the values
virtual void shift(size_t i);
virtual void slide(unsigned char c);
int get_fingerprint();
private:
int fingerprint;
};
class Rabin_karp : Rolling_hash {
// Hash function: sum_{i=1}^n x_i*2^{n-i} mod p
// With X being a binary string
public:
// Let prime `p` be an int32_t, which ensures that it is small enough to avoid underflows
Rabin_karp(uint32_t p, size_t length);
void slide(unsigned char c);
private:
void slide_bit(bool bit_in, bool bit_out);
std::queue<unsigned char> elements;
uint32_t fingerprint = 0;
uint32_t xi; // 2^n mod p
uint32_t p; // prime
};
class Polynomial_fingerprint : Rolling_hash {
// Hash function: sum_{i=1}^l s_i*r^i mod p
// r in F_p
// TODO: Add some overflow warning
// TODO: Untested
public:
Polynomial_fingerprint(int32_t p, int32_t r);
void push(unsigned char c);
void shift(size_t i);
private:
std::queue<unsigned char> elements;
int32_t fingerprint = 0;
int32_t p; // prime
int32_t r; // r in F_p
};
class Porat_porat_polynomial_fingerprint : Rolling_hash {
// Hash function: sum_{i=1}^l s_i*r^i mod p
// r in F_p
// TODO: Add some overflow warning
// TODO: Untested
public:
Porat_porat_polynomial_fingerprint(std::string P, int32_t p, int32_t r);
void push(unsigned char c);
void shift(size_t i);
bool should_children_be_killed();
int get_generation();
int get_child();
protected:
// TODO: Optimize the code such that we don't copy the prehashed values, but instead share it between all instances.
// I guess we would have to store it in a separate object, and then pass a reference to it around (to avoid it getting destroyed early).
Porat_porat_polynomial_fingerprint(int32_t p, int32_t r, std::vector<int32_t> prehashed_values, std::vector<unsigned int> shortest_periods, std::map<unsigned int, unsigned int> prehashed_indices);
private:
std::queue<unsigned char> elements;
int32_t fingerprint = 0;
int32_t p; // prime
int32_t r; // r in F_p
size_t m; // pattern length
std::vector<int32_t> prehashed_values;
std::vector<unsigned int> shortest_periods;
std::map<unsigned int, unsigned int> prehashed_indices;
};
/* void push(const char * cs, size_t n) { */
/* for (size_t i = 0; i < n; i++) */
/* push(cs[i]); */
/* } */
/* void push(std::string s) { */
/* for (char c : s) */
/* push(c); */
/* } */
#endif

View File

@ -0,0 +1,68 @@
import math
k = 31
F = GF(2)
var = 'x'
PR = PolynomialRing(F, var)
p = PR.irreducible_element(k)
gamma = PR.random_element(k)
exp = 10
target = gamma^exp % p
def polynomial_power_in_Z2(q, exp, p, var):
if exp == 0:
return 1
if q.degree() == p.degree():
q += p
res = q
for i in range(2, exp+1):
res = multiply_polynomials_in_Z2(res, q, p, var)
return res
def polynomial_power_in_Z2_V2(q, exp, p, var):
if exp == 0:
return 1
if q.degree() == p.degree():
q += p
log2deg = math.floor(math.log2(exp)) + 1
arr = [0]*log2deg
arr[0] = q
for i in range(1, log2deg):
arr[i] = multiply_polynomials_in_Z2(arr[i-1], arr[i-1], p, var)
res = PR(1)
for i, b in enumerate(bin(exp)[:1:-1]):
if b == '1':
res = multiply_polynomials_in_Z2(res, arr[i], p, var)
return res
def multiply_polynomials_in_Z2(q1, q2, p, var):
if q1.degree() > p.degree():
raise ValueError('Unsupported!')
if q1.degree() == p.degree():
q1 += p
arr = [0]*(q2.degree()+1)
arr[0] = q1
for i in range(1, q2.degree()+1):
arr[i] = arr[i-1]*PR(var)
if arr[i].degree() == p.degree():
arr[i] += p
return sum(arr[i] for i in q2.exponents())
print(target)
res = polynomial_power_in_Z2(gamma, exp, p, var)
print(res)
res = polynomial_power_in_Z2_V2(gamma, exp, p, var)
print(res)

226
porat-porat.cpp Normal file
View File

@ -0,0 +1,226 @@
/* #define NDEBUG */
#include <stdlib.h>
#include <string>
#include <vector>
#include <iostream>
#include <map>
#include <thread>
#include <math.h>
#include <string_view>
#include "hash_function_library.hpp"
// Initialization of constants
int p = 7919; // the prime for our hash function
int r = 11; // random int in \in F_p
char T[] = "abcabcabcdabc"; // text
/* char P[] = "abcabc"; // pattern */
char P[] = "abcabcabcabcabcabcddddddddddddddddddabc"; // pattern
std::vector<long> prehashed_values;
std::vector<int> shortest_periods;
std::map<int, int> prehashed_indices;
// https://stackoverflow.com/questions/18620942/find-the-smallest-period-of-input-string-in-on
// BEGIN stolen code
std::vector<int> calculateLPS(char * pat, int m) {
/* int[] lps = new int[pat.length()]; */
int len = 0;
int i = 1;
std::vector<int> lps = {0};
lps.resize(m);
while (i < m) {
if (pat[i] == pat[len]) {
len++;
lps[i] = len;
i++;
}
else {
if (len != 0) {
len = lps[len - 1];
}
else {
lps[i] = len;
i++;
}
}
}
return lps;
}
// calculates the length of the shortest period
int len_of_shortest_period (char * pattern, int m) {
std::vector<int> lps = calculateLPS(pattern, m);
//start at the end of the string
int i = lps.size()-1;
while (lps[i] != 0) {
//shift back
i -= lps[i];
}
return i+1;
}
// END
class porat_process {
// TODO: use a different hash function. This one is BAD
public:
// we use the polynomial fingerprint
void increment_hash (char c) {
prev_pow = prev_pow*r % p;
hash = (hash + c*prev_pow) % p;
l++;
}
void subtract_hash (long pre_fingerprint, int i) {
// i is the number of removed elements
// pre_fingerprint is the fingerprint of those previous elements
hash = (hash - pre_fingerprint)/(long)pow(r, i); // we are guaranteed that integer division will return a whole number
prev_pow /= (long)pow(r, i);
l -= i;
// TODO: untested, especially prev_pow
}
bool should_spawn_child() {
if (l == next_i_squared) {
next_i_squared <<= 1;
return true;
}
else
return false;
}
long get_fingerprint() {
return hash;
}
private:
long prev_pow = 1;
int l = 0;
int next_i_squared = 1;
long hash = 0;
};
void print_map(std::string_view comment, const std::map<int, int>& m)
{
std::cout << comment;
for (const auto& [key, value] : m) {
std::cout << key << " = " << value << "; ";
}
std::cout << "\n";
}
void print_vector(std::string_view comment, const std::vector<int>& m)
{
std::cout << comment << "[";
for (const auto& a : m) {
std::cout << a << ", ";
}
std::cout << "]\n";
}
void print_vector_long(std::string_view comment, const std::vector<long>& m)
{
std::cout << comment << "[";
for (const auto& a : m) {
std::cout << a << ", ";
}
std::cout << "]\n";
}
int main() {
int n = sizeof(T)/sizeof(char) - 1;
int m = sizeof(P)/sizeof(char) - 1;
{
int i = 1;
while (i < m) {
// calculate shortest period length
int period = len_of_shortest_period(P, i);
shortest_periods.push_back(period);
// calculate fingerprint of period
if (!prehashed_indices.contains(period)) {
porat_process process;
for (int ii = 0; ii < period; ii++){
std::cout << P[ii];
process.increment_hash(P[ii]);
}
/* prehashed_indices[period] = process.get_fingerprint(); */
prehashed_values.push_back(process.get_fingerprint());
std::cout << period << " " << prehashed_values.size() << std::endl;
prehashed_indices[period] = prehashed_values.size()-1;
}
i <<= 1;
}
if (i != m) { // so i>m, which means we skipped exactly m
// calculate shortest period length
int period = len_of_shortest_period(P, m);
shortest_periods.push_back(period);
// calculate fingerprint of period
if (!prehashed_indices.contains(period)) {
porat_process process;
std::cout << "[";
for (int ii = 0; ii < period; ii++) {
std::cout << P[ii];
process.increment_hash(P[ii]);
}
std::cout << "]\n";
prehashed_values.push_back(process.get_fingerprint());
std::cout << period << " " << prehashed_values.size() << std::endl;
prehashed_indices[period] = prehashed_values.size()-1;
}
/* // calculate fingerprint of phi(P_{2^i}) */
/* while (ii < m) { */
/* process.increment_hash(P[ii]); */
/* ii++; */
/* } */
/* prehashed_values.push_back(process.get_fingerprint()); */
}
}
{
std::cout << P << std::endl;
int i = 0;
while ((1 << i) < m) {
std::cout << "pattern: ";
for (int ii = 0; ii < (1 << i); ii++)
std::cout << P[ii];
std::cout << std::endl;
std::cout << "period: ";
for (int ii = 0; ii < shortest_periods[i]; ii++)
std::cout << P[ii];
std::cout << std::endl;
std::cout << "|prefix_{P_" << (1 << i) << "}| = " << shortest_periods[i] << std::endl;
std::cout << prehashed_values[prehashed_indices[shortest_periods[i]]] << std::endl;
i++;
}
if ((1 << i) != m) { // so i>m, which means we skipped exactly m
std::cout << "pattern: ";
for (int ii = 0; ii < m; ii++)
std::cout << P[ii];
std::cout << std::endl;
std::cout << "period: ";
for (int ii = 0; ii < shortest_periods[i]; ii++)
std::cout << P[ii];
std::cout << std::endl;
std::cout << "|prefix_{P_" << m << "}| = " << shortest_periods[i] << std::endl;
std::cout << prehashed_values[prehashed_indices[shortest_periods[i]]] << std::endl;
std::cout << prehashed_values[0] << std::endl;
std::cout << prehashed_values[1] << std::endl;
std::cout << prehashed_values[2] << std::endl;
std::cout << prehashed_values[3] << std::endl;
std::cout << prehashed_values[4] << std::endl;
}
}
print_map("Indices map: ", prehashed_indices);
print_vector_long("Values vector: ", prehashed_values);
print_vector("Periods vector: ", shortest_periods);
return EXIT_SUCCESS;
}

BIN
simple_string_matching Executable file

Binary file not shown.

View File

@ -0,0 +1,65 @@
/* #define NDEBUG */
#include "Rabin_fingerprint.hpp"
#include "general_library.hpp"
#include <iostream>
#include <stdint.h>
#include <math.h>
#include <string>
#include <fstream>
void print_match (size_t index, size_t length, std::string &T) {
std::cout << "Match found at index " << index << " with the text \"";
for (size_t i = 0; i < length; i++)
std::cout << T[index + i];
std::cout << "\"" << std::endl;
}
int main() {
/* std::ifstream ifs("books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream ifs("books/genji_monogatari_english.txt");
std::string T( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
/* std::string T = "Hello, this is my test string averylongword is a necessary word to exceed the 32 bit window."; */
// Test without the modulo polynomial - and two matches
std::string P = "word";
// Test with the modulo polynomial
/* std::string P = "averylongword"; */
std::cout << "Searching for pattern:" << std::endl;
std::cout << " " << P << std::endl;
/* std::cout << "in text:" << std::endl; */
/* std::cout << " " << T << std::endl; */
std::cout << std::endl;
/* uint32_t polynomial = pow(2, 30) + pow(2, 2) + 1; // x^31 + x^3 + 1 */
uint32_t polynomial = get_random_irreducible_polynomial_in_Z2(31);
/* uint32_t polynomial = 0b11010011100100000111101011110111; */
// Test without the modulo polynomial
size_t window_size_in_bits = P.length()*8;
// Hash the pattern
Rabin_fingerprint fP(polynomial, window_size_in_bits);
for (char c : P)
fP.push_char(c);
// Hash the text
Rabin_fingerprint fT(polynomial, window_size_in_bits);
for (size_t i = 0; i < P.length(); i++)
fT.push_char(T[i]);
if (fT.get_fingerprint() == fP.get_fingerprint())
print_match(0, P.length(), T);
for (size_t i = P.length(); i < T.length(); i++) {
fT.slide_char(T[i], T[i-P.length()]);
if (fT.get_fingerprint() == fP.get_fingerprint())
print_match(i-P.length()+1, P.length(), T);
}
std::cout << std::endl;
std::cout << "Done!" << std::endl;
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,97 @@
from random import randint
F = GF(2)
PR = PolynomialRing(F, 'x')
k = 31
P = PR.irreducible_element(k)
m = 100
# Test the calculation for moving the left edge of the sliding window
for _ in range(1):
q = tuple(randint(0, 1) for _ in range(m))
for i in range(1, m+1):
for j in range(i+1, m+1):
assert (PR(q[:i]) % P) == (PR(q) % P) - (PR(f'x^{i}')*PR(q[i:]) % P)
# Moving the right side of the window is as described in the Rabin Fingerprint article
# Next step (TODO)
# pattern = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,1)
# period = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0)
# m = len(pattern)
# s_period = tuple(list(period)+[0]*(m-len(period)))
# pattern_1 = (1)
# period_1 = (1)
# s_period_1 = tuple(list(period_1)+[0]*(m-len(period_1)))
# pattern_2 = (1,0)
# period_2 = (1,0)
# s_period_2 = tuple(list(period_2)+[0]*(m-len(period_2)))
# pattern_4 = (1,0,1,1)
# period_4 = (1,0,1)
# s_period_4 = tuple(list(period_4)+[0]*(m-len(period_4)))
# pattern_8 = (1,0,1,1,1,1,0,1)
# period_8 = (1,0,1,1,1)
# s_period_8 = tuple(list(period_8)+[0]*(m-len(period_8)))
# pattern_16 = (1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1)
# period_16 = (1,0,1,1,1,1,0,1,1,0,1,0)
# s_period_16 = tuple(list(period_16)+[0]*(m-len(period_16)))
# for i in range(len(pattern)-len(pattern_4)-len(period_4)):
# assert PR(pattern[i+len(period_4):i+len(pattern_4)+len(period_4)]) == PR(pattern[i:i+len(pattern_4)] + pattern[i+len(pattern_4):i+len(pattern_4)+len(period_4)]) - PR(s_period_4)
# assert PR(pattern[i+len(period_4):i+len(pattern_4)+len(period_4)]) == pattern[i-len(period_4):i+len(pattern_4)-len(period_4)]+pattern[i+len(pattern_4)-len(period_4):i+len(pattern_4)-len(period_4)] - PR(s_period_4)
# Does the bits of a prime number correspond to the coefficients of an irreducible polynomial?
k = 31
reducible = 0
irreducible = 0
for p in Primes()[200000000:200010000]:#[100000:100000]:
# for p in range(2147483648,2147483648+10000):
poly = PR(tuple(bin(p)[2:][::-1]))
# print(f'{poly.degree()}')
if poly.degree() > k:
break
if poly.degree() != k:
continue
irr = poly.is_irreducible()
# print(f'The polynomial {poly}, corresponding to the prime {p}, being an irreducible polynomial is {irr}.')
if irr:
irreducible += 1
else:
reducible += 1
print(f'{irreducible}/{reducible+irreducible} of the primes correspond to irreducible polynomials. That is {float(irreducible/(reducible+irreducible))*100}%')
progress = 0
the_same = 0
different = 0
for p in Primes()[200000000:200010000]:
progress += 1
print(f'\r{progress}/10000', end='')
p_poly = PR(tuple(bin(p)[2:][::-1]))
if not p_poly.is_irreducible():
continue
# i = p + randint(1,2**29)
i = p + randint(1,p-1)
# print(f'{i}, {p}, {2*p}, {i%p}')
i_poly = PR(tuple(bin(i)[2:][::-1]))
i_mod_p = i % p
i_mod_p_poly = PR(tuple(bin(i_mod_p)[2:][::-1]))
# print(f'i_poly: {i_poly}')
# print(f'p_poly: {p_poly}')
# print(f'i_mod_p_poly: {i_mod_p_poly}')
if i_mod_p_poly == i_poly % p_poly:
the_same += 1
else:
different += 1
# if not i_mod_p_poly == i_poly - (p_poly - PR(f'x^{p_poly.degree()}')):
# if not i_mod_p_poly == i_poly % p_poly:
# print()
# print('ERROR')
# print(i_mod_p_poly)
# # print(i_poly - (p_poly - PR(f'x^{p_poly.degree()}')))
# print(i_poly % p_poly)
# break
# assert i_mod_p_poly == i_poly - (p_poly - PR(f'x^{p_poly.degree()}'))
print()
print(f'{the_same}/{the_same+different} are the same as if we did polynomial calculations. This is {float(the_same/(the_same+different))*100}%')