// Listing metadata carried over from the hosting page: 227 lines, 6.9 KiB, C++.
/* #define NDEBUG */
|
|
|
|
#include <stdlib.h>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <iostream>
|
|
#include <map>
|
|
#include <thread>
|
|
#include <math.h>
|
|
|
|
#include <string_view>
|
|
|
|
#include "hash_function_library.hpp"
|
|
|
|
// Global parameters and lookup tables shared by the whole file

int p{7919};  // prime modulus of the fingerprint (hash) function
int r{11};    // base of the fingerprint, an element of F_p (fixed here, not drawn at random)

char T[]{"abcabcabcdabc"};  // text
/* char P[] = "abcabc"; // pattern */
char P[]{"abcabcabcabcabcabcddddddddddddddddddabc"};  // pattern

// Fingerprints of the distinct shortest periods, in the order main() discovers them.
std::vector<long> prehashed_values;

// Shortest period length of each pattern prefix processed by main().
std::vector<int> shortest_periods;

// Maps a period length to the position of its fingerprint in prehashed_values.
std::map<int, int> prehashed_indices;
|
|
|
|
// https://stackoverflow.com/questions/18620942/find-the-smallest-period-of-input-string-in-on
|
|
// BEGIN stolen code
|
|
/// Builds the KMP failure table ("longest proper prefix that is also a suffix")
/// for the first m characters of pat.
///
/// @param pat  Character buffer; at least m characters must be readable.
/// @param m    Number of characters to process. Values <= 0 yield an empty table.
/// @return     A vector lps of size m in which lps[i] is the length of the longest
///             proper prefix of pat[0..i] that is also a suffix of pat[0..i].
std::vector<int> calculateLPS(const char * pat, int m) {
    if (m <= 0) {
        return {};
    }

    std::vector<int> lps(m);  // value-initialised, so lps[0] == 0 as required
    int len = 0;              // length of the border currently being extended
    int i = 1;

    while (i < m) {
        if (pat[i] == pat[len]) {
            // The current border can be extended by one character.
            len++;
            lps[i] = len;
            i++;
        }
        else if (len != 0) {
            // Fall back to the next shorter border and retry the same position.
            len = lps[len - 1];
        }
        else {
            // No border ends at position i.
            lps[i] = 0;
            i++;
        }
    }
    return lps;
}

/// Calculates the length of the shortest period of the first m characters of
/// pattern, i.e. the smallest q >= 1 with pattern[k] == pattern[k + q] for all
/// 0 <= k < m - q (the last repetition may be cut off).
///
/// @param pattern  Character buffer; at least m characters must be readable.
/// @param m        Length of the prefix to analyse.
/// @return         The period length in [1, m], or 0 when m <= 0.
int len_of_shortest_period (const char * pattern, int m) {
    if (m <= 0) {
        return 0;  // an empty string has no period; also avoids indexing an empty table
    }

    const std::vector<int> lps = calculateLPS(pattern, m);

    // A border of length b corresponds to a period of length m - b, so the
    // longest border gives the shortest period. Repeatedly stepping back by
    // lps[i] is NOT equivalent: for "abaab" it would report 2 although the
    // shortest period is 3.
    return m - lps.back();
}
|
|
// END
|
|
|
|
class porat_process {
|
|
// TODO: use a different hash function. This one is BAD
|
|
public:
|
|
// we use the polynomial fingerprint
|
|
void increment_hash (char c) {
|
|
prev_pow = prev_pow*r % p;
|
|
hash = (hash + c*prev_pow) % p;
|
|
l++;
|
|
}
|
|
|
|
void subtract_hash (long pre_fingerprint, int i) {
|
|
// i is the number of removed elements
|
|
// pre_fingerprint is the fingerprint of those previous elements
|
|
hash = (hash - pre_fingerprint)/(long)pow(r, i); // we are guaranteed that integer division will return a whole number
|
|
prev_pow /= (long)pow(r, i);
|
|
l -= i;
|
|
// TODO: untested, especially prev_pow
|
|
}
|
|
|
|
bool should_spawn_child() {
|
|
if (l == next_i_squared) {
|
|
next_i_squared <<= 1;
|
|
return true;
|
|
}
|
|
else
|
|
return false;
|
|
}
|
|
|
|
long get_fingerprint() {
|
|
return hash;
|
|
}
|
|
|
|
private:
|
|
long prev_pow = 1;
|
|
int l = 0;
|
|
int next_i_squared = 1;
|
|
long hash = 0;
|
|
};
|
|
|
|
// Writes `comment`, then every entry of m as "key = value; " in ascending key
// order, then a newline, to std::cout.
void print_map(std::string_view comment, const std::map<int, int>& m)
{
    std::cout << comment;
    for (auto entry = m.cbegin(); entry != m.cend(); ++entry)
        std::cout << entry->first << " = " << entry->second << "; ";
    std::cout << "\n";
}
|
|
|
|
// Writes `comment` followed by the elements of m as "[a, b, c, ]" and a
// newline to std::cout (every element, including the last, is followed by ", ").
void print_vector(std::string_view comment, const std::vector<int>& m)
{
    std::cout << comment;
    std::cout << "[";
    for (std::vector<int>::size_type pos = 0; pos < m.size(); ++pos)
        std::cout << m[pos] << ", ";
    std::cout << "]\n";
}
|
|
|
|
// Same output format as print_vector, for a vector of long: `comment`, then
// "[a, b, c, ]" and a newline on std::cout.
void print_vector_long(std::string_view comment, const std::vector<long>& m)
{
    std::cout << comment;
    std::cout << "[";
    for (auto item = m.cbegin(); item != m.cend(); ++item)
        std::cout << *item << ", ";
    std::cout << "]\n";
}
|
|
|
|
int main() {
|
|
int n = sizeof(T)/sizeof(char) - 1;
|
|
int m = sizeof(P)/sizeof(char) - 1;
|
|
|
|
{
|
|
int i = 1;
|
|
while (i < m) {
|
|
// calculate shortest period length
|
|
int period = len_of_shortest_period(P, i);
|
|
shortest_periods.push_back(period);
|
|
|
|
// calculate fingerprint of period
|
|
if (!prehashed_indices.contains(period)) {
|
|
porat_process process;
|
|
for (int ii = 0; ii < period; ii++){
|
|
std::cout << P[ii];
|
|
process.increment_hash(P[ii]);
|
|
}
|
|
/* prehashed_indices[period] = process.get_fingerprint(); */
|
|
prehashed_values.push_back(process.get_fingerprint());
|
|
std::cout << period << " " << prehashed_values.size() << std::endl;
|
|
prehashed_indices[period] = prehashed_values.size()-1;
|
|
}
|
|
|
|
i <<= 1;
|
|
}
|
|
if (i != m) { // so i>m, which means we skipped exactly m
|
|
// calculate shortest period length
|
|
int period = len_of_shortest_period(P, m);
|
|
shortest_periods.push_back(period);
|
|
|
|
// calculate fingerprint of period
|
|
if (!prehashed_indices.contains(period)) {
|
|
porat_process process;
|
|
std::cout << "[";
|
|
for (int ii = 0; ii < period; ii++) {
|
|
std::cout << P[ii];
|
|
process.increment_hash(P[ii]);
|
|
}
|
|
std::cout << "]\n";
|
|
prehashed_values.push_back(process.get_fingerprint());
|
|
std::cout << period << " " << prehashed_values.size() << std::endl;
|
|
prehashed_indices[period] = prehashed_values.size()-1;
|
|
}
|
|
/* // calculate fingerprint of phi(P_{2^i}) */
|
|
/* while (ii < m) { */
|
|
/* process.increment_hash(P[ii]); */
|
|
/* ii++; */
|
|
/* } */
|
|
/* prehashed_values.push_back(process.get_fingerprint()); */
|
|
}
|
|
}
|
|
|
|
{
|
|
std::cout << P << std::endl;
|
|
int i = 0;
|
|
while ((1 << i) < m) {
|
|
std::cout << "pattern: ";
|
|
for (int ii = 0; ii < (1 << i); ii++)
|
|
std::cout << P[ii];
|
|
std::cout << std::endl;
|
|
std::cout << "period: ";
|
|
for (int ii = 0; ii < shortest_periods[i]; ii++)
|
|
std::cout << P[ii];
|
|
std::cout << std::endl;
|
|
std::cout << "|prefix_{P_" << (1 << i) << "}| = " << shortest_periods[i] << std::endl;
|
|
std::cout << prehashed_values[prehashed_indices[shortest_periods[i]]] << std::endl;
|
|
i++;
|
|
}
|
|
if ((1 << i) != m) { // so i>m, which means we skipped exactly m
|
|
std::cout << "pattern: ";
|
|
for (int ii = 0; ii < m; ii++)
|
|
std::cout << P[ii];
|
|
std::cout << std::endl;
|
|
std::cout << "period: ";
|
|
for (int ii = 0; ii < shortest_periods[i]; ii++)
|
|
std::cout << P[ii];
|
|
std::cout << std::endl;
|
|
std::cout << "|prefix_{P_" << m << "}| = " << shortest_periods[i] << std::endl;
|
|
std::cout << prehashed_values[prehashed_indices[shortest_periods[i]]] << std::endl;
|
|
std::cout << prehashed_values[0] << std::endl;
|
|
std::cout << prehashed_values[1] << std::endl;
|
|
std::cout << prehashed_values[2] << std::endl;
|
|
std::cout << prehashed_values[3] << std::endl;
|
|
std::cout << prehashed_values[4] << std::endl;
|
|
}
|
|
}
|
|
|
|
print_map("Indices map: ", prehashed_indices);
|
|
print_vector_long("Values vector: ", prehashed_values);
|
|
print_vector("Periods vector: ", shortest_periods);
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|