Bachelors_Thesis_Code/porat-porat.cpp
2021-11-14 14:35:05 +01:00

227 lines
6.9 KiB
C++

/* #define NDEBUG */
#include <stdlib.h>
#include <string>
#include <vector>
#include <iostream>
#include <map>
#include <thread>
#include <math.h>
#include <string_view>
#include "hash_function_library.hpp"
// Initialization of constants
int p = 7919; // the prime modulus of our hash function; fingerprints are reduced mod p
int r = 11; // multiplier of the polynomial fingerprint, an element of F_p (hard-coded here)
char T[] = "abcabcabcdabc"; // text
/* char P[] = "abcabc"; // pattern */
char P[] = "abcabcabcabcabcabcddddddddddddddddddabc"; // pattern
// Fingerprints computed by main(): one entry per distinct period length, in
// order of first appearance. Each is the fingerprint of the first `period`
// characters of P.
std::vector<long> prehashed_values;
// Shortest period length of every pattern prefix processed by main(), in
// processing order (duplicates are kept).
std::vector<int> shortest_periods;
// Maps a period length to the index of its fingerprint in prehashed_values.
std::map<int, int> prehashed_indices;
// https://stackoverflow.com/questions/18620942/find-the-smallest-period-of-input-string-in-on
// BEGIN stolen code
// Builds the KMP failure table for the first m characters of pat.
// Entry k holds the length of the longest proper prefix of pat[0..k] that is
// also a suffix of pat[0..k]. The result has exactly m entries.
std::vector<int> calculateLPS(char * pat, int m) {
    std::vector<int> table(static_cast<std::vector<int>::size_type>(m));

    int border = 0;  // length of the border we are currently trying to extend
    for (int pos = 1; pos < m; ) {
        const bool extends = (pat[pos] == pat[border]);
        if (!extends && border != 0) {
            // Mismatch on a non-empty border: fall back to the next shorter
            // border and retry the same position.
            border = table[border - 1];
            continue;
        }
        if (extends) {
            border++;
        }
        // Either the border grew by one, or no border ends here (border == 0).
        table[pos] = border;
        pos++;
    }
    return table;
}
// Calculates the length of the shortest period of the first m characters of
// pattern, i.e. the smallest q >= 1 such that pattern[k] == pattern[k + q]
// for every k with k + q < m. The last repetition may be cut off, so q does
// not have to divide m.
//
// pattern: buffer holding at least m characters (only read).
// m:       number of characters to consider.
// Returns the period length in [1, m], or 0 when m <= 0.
int len_of_shortest_period (char * pattern, int m) {
    if (m <= 0) {
        // Nothing to inspect; also avoids indexing an empty failure table.
        return 0;
    }
    const std::vector<int> lps = calculateLPS(pattern, m);
    // The shortest period and the longest border are complementary:
    // period = m - (longest proper prefix that is also a suffix).
    // Repeatedly jumping back by lps[i] (the previous approach) can overshoot:
    // for "abaab" it produced 2 although the shortest period is 3.
    return m - lps.back();
}
// END
// Incrementally maintained polynomial fingerprint of a character sequence.
//
// After feeding the characters c_1 .. c_k the state is
//   hash     == (c_1*r^1 + c_2*r^2 + ... + c_k*r^k) mod p
//   prev_pow == r^k mod p
//   l        == k
// where p and r are the file-level modulus and multiplier. All stored
// residues are kept in [0, p) so that equal fingerprints compare equal.
class porat_process {
    // TODO: use a different hash function. This one is BAD
public:
    // we use the polynomial fingerprint
    // Appends character c to the fingerprinted sequence.
    void increment_hash (char c) {
        prev_pow = mod_mul(prev_pow, r);
        hash = reduce(hash + mod_mul(c, prev_pow));
        l++;
    }

    // Removes the first i characters from the fingerprinted sequence.
    //
    // pre_fingerprint: fingerprint of exactly those i leading characters.
    // i:               number of removed characters (expected 0 <= i <= l).
    //
    // Because every value is stored reduced mod p, "dividing by r^i" has to be
    // a multiplication with the modular inverse of r^i; plain integer division
    // of the residues gives wrong results. The inverse of r is r^(p-2) mod p
    // by Fermat's little theorem, which requires p to be prime and r not to be
    // a multiple of p.
    // TODO: still lacks an automated test.
    void subtract_hash (long pre_fingerprint, int i) {
        const long r_inverse = mod_pow(r, p - 2);
        const long shift_back = mod_pow(r_inverse, i);  // (r^i)^-1 mod p
        hash = mod_mul(reduce(hash - pre_fingerprint), shift_back);
        prev_pow = mod_mul(prev_pow, shift_back);
        l -= i;
    }

    // Returns true exactly once each time the number of consumed characters
    // reaches the next power of two (1, 2, 4, 8, ...), then arms the next one.
    bool should_spawn_child() {
        if (l != next_spawn_length) {
            return false;
        }
        next_spawn_length <<= 1;
        return true;
    }

    // Returns the current fingerprint, a residue in [0, p).
    long get_fingerprint() const {
        return hash;
    }

private:
    // Maps v to its representative in [0, p), also for negative v.
    static long reduce(long long v) {
        long long rem = v % p;
        if (rem < 0) {
            rem += p;
        }
        return static_cast<long>(rem);
    }

    // Returns (a * b) mod p, using a wide intermediate for the product.
    static long mod_mul(long long a, long long b) {
        return reduce(a * b);
    }

    // Returns base^exponent mod p by square-and-multiply; exponents <= 0 yield 1.
    static long mod_pow(long long base, long long exponent) {
        long result = reduce(1);
        long factor = reduce(base);
        while (exponent > 0) {
            if (exponent & 1) {
                result = mod_mul(result, factor);
            }
            factor = mod_mul(factor, factor);
            exponent >>= 1;
        }
        return result;
    }

    long prev_pow = 1;          // r^l mod p
    int l = 0;                  // number of characters currently fingerprinted
    int next_spawn_length = 1;  // next value of l at which should_spawn_child() fires
    long hash = 0;              // current fingerprint
};
// Writes `comment`, then every entry of `m` as "key = value; " in ascending
// key order, then a newline, to standard output.
void print_map(std::string_view comment, const std::map<int, int>& m)
{
    std::cout << comment;
    for (auto entry = m.cbegin(); entry != m.cend(); ++entry) {
        std::cout << entry->first << " = " << entry->second << "; ";
    }
    std::cout << "\n";
}
// Writes `comment` followed by the elements of `m` in the form
// "[a, b, c, ]" (every element is followed by ", ") and a newline.
void print_vector(std::string_view comment, const std::vector<int>& m)
{
    std::cout << comment;
    std::cout << "[";
    for (std::vector<int>::size_type idx = 0; idx < m.size(); ++idx) {
        std::cout << m[idx];
        std::cout << ", ";
    }
    std::cout << "]\n";
}
// Same output format as print_vector, for a vector of long values:
// `comment`, then "[a, b, c, ]" and a newline.
void print_vector_long(std::string_view comment, const std::vector<long>& m)
{
    std::cout << comment << "[";
    for (auto cursor = m.begin(); cursor != m.end(); ++cursor) {
        std::cout << *cursor << ", ";
    }
    std::cout << "]\n";
}
// Preprocesses the pattern P and prints the result for inspection.
//
// The processed prefix lengths are 1, 2, 4, ... (all powers of two strictly
// below m) followed by m itself, where m is the length of P. For each of them
// the shortest period is appended to shortest_periods, and the fingerprint of
// that period (the first `period` characters of P) is stored once per distinct
// period length in prehashed_values / prehashed_indices.
//
// The full length m is always processed. Previously it was skipped whenever m
// was a power of two, because the doubling loop stops before reaching m and
// the follow-up check only ran for i != m.
int main() {
    const int m = static_cast<int>(sizeof(P)/sizeof(char)) - 1;

    // Prefix lengths to process, in order. An empty pattern yields no entries.
    std::vector<int> prefix_lengths;
    for (int len = 1; len < m; len <<= 1) {
        prefix_lengths.push_back(len);
    }
    if (m > 0) {
        prefix_lengths.push_back(m);
    }

    // Pass 1: compute periods and the fingerprint of every new period.
    for (const int len : prefix_lengths) {
        // calculate shortest period length
        const int period = len_of_shortest_period(P, len);
        shortest_periods.push_back(period);
        // calculate fingerprint of period, unless this period length is known
        if (prehashed_indices.contains(period)) {
            continue;
        }
        // Debug trace: the period of the full pattern is printed in brackets
        // on its own line, the others directly in front of their numbers.
        const bool is_full_pattern = (len == m);
        if (is_full_pattern) {
            std::cout << "[";
        }
        porat_process process;
        for (int k = 0; k < period; k++) {
            std::cout << P[k];
            process.increment_hash(P[k]);
        }
        if (is_full_pattern) {
            std::cout << "]\n";
        }
        prehashed_values.push_back(process.get_fingerprint());
        std::cout << period << " " << prehashed_values.size() << '\n';
        prehashed_indices[period] = static_cast<int>(prehashed_values.size()) - 1;
    }

    // Pass 2: report every processed prefix with its period and fingerprint.
    std::cout << P << '\n';
    for (std::size_t slot = 0; slot < prefix_lengths.size(); ++slot) {
        const int len = prefix_lengths[slot];
        const int period = shortest_periods[slot];
        std::cout << "pattern: ";
        std::cout.write(P, len);
        std::cout << '\n';
        std::cout << "period: ";
        std::cout.write(P, period);
        std::cout << '\n';
        std::cout << "|prefix_{P_" << len << "}| = " << period << '\n';
        std::cout << prehashed_values[prehashed_indices.at(period)] << '\n';
    }
    if (m > 0) {
        // Dump every stored fingerprint. Iterating over the vector instead of
        // hard-coding indices 0..4 stays in bounds for any pattern.
        for (const long value : prehashed_values) {
            std::cout << value << '\n';
        }
    }

    print_map("Indices map: ", prehashed_indices);
    print_vector_long("Values vector: ", prehashed_values);
    print_vector("Periods vector: ", shortest_periods);
    std::cout.flush();
    return EXIT_SUCCESS;
}