diff --git a/README.md b/README.md index 7c8d1c7..1c41f2e 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,20 @@ Code used for my Bachelor's Thesis +# Issues +When reading the pattern from a file, note that according to UNIX standards ([read details here](https://unix.stackexchange.com/questions/18743/whats-the-point-in-adding-a-new-line-to-the-end-of-a-file)) a "text file" has to end with a newline, so most editors append one automatically. +That trailing newline becomes an unintended part of the pattern; to avoid it, write the file using: +`echo -n word > pattern.txt` +or, if you use Vim, remove the newline using: +``` +:set binary +:set noeol +:wq +``` + # Versions ## Current Version -Compared to V1, this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder. +Compared to V1, in this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder. This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work. 
diff --git a/V1/simple_string_matching b/V1/simple_string_matching index 54fb419..03d9198 100755 Binary files a/V1/simple_string_matching and b/V1/simple_string_matching differ diff --git a/books/pattern.txt b/books/pattern.txt new file mode 100644 index 0000000..387155e --- /dev/null +++ b/books/pattern.txt @@ -0,0 +1 @@ +word \ No newline at end of file diff --git a/processes.cpp b/processes.cpp index 3f53510..ec18695 100644 --- a/processes.cpp +++ b/processes.cpp @@ -5,14 +5,14 @@ Rabin_fingerprint_process::Rabin_fingerprint_process(uint32_t irr_poly, size_t w phi(irr_poly, window_size_in_bits) {} -void Rabin_fingerprint_process::stream_char (char c) { +void Rabin_fingerprint_process::stream_char(char c) { std::bitset<8> b(c); for (char i = 7; i >= 0; i--) { stream_bit((bool)b[i]); } } -void Rabin_fingerprint_process::stream_bit (bool b) { +void Rabin_fingerprint_process::stream_bit(bool b) { if (window.size() == window_size_in_bits) { window.push(b); bool b_out = window.front(); @@ -24,6 +24,36 @@ void Rabin_fingerprint_process::stream_bit (bool b) { } } -uint32_t Rabin_fingerprint_process::get_fingerprint () { +uint32_t Rabin_fingerprint_process::get_fingerprint() { return phi.get_fingerprint(); } + +std::string Rabin_fingerprint_process::get_string_in_window() { + // check if window contains a whole number of chars + if ((window_size_in_bits & 0b111) != 0) + throw std::logic_error("The fingerprinting window doesn't contain a whole number of chars (counting the bits), so it doesn't make sense to return it as a string."); + + #ifndef NDEBUG + if (window.size() != window_size_in_bits) + throw std::logic_error("False match! The sliding window isn't even filled yet, which means you matched the pattern of a substring shorter than the pattern. 
This case should be handled/avoided elsewhere, so we throw an error."); + #endif + + std::ostringstream os; + for (size_t i = 0; i < window.size()>>3; i++) { + // cycle the char + char c = 0; + for (size_t j = 0; j < 8; j++) { + bool b = window.front(); + window.pop(); + window.push(b); + c <<= 1; + c |= b; + } + + os << c; + } + + std::string s = os.str(); + + return s; +} diff --git a/processes.hpp b/processes.hpp index 05561b1..a412d32 100644 --- a/processes.hpp +++ b/processes.hpp @@ -6,6 +6,8 @@ #include #include +#include +#include class Rabin_fingerprint_process { public: @@ -13,6 +15,7 @@ class Rabin_fingerprint_process { void stream_char(char c); void stream_bit(bool b); uint32_t get_fingerprint(); + std::string get_string_in_window(); private: std::queue window; diff --git a/simple_string_matching b/simple_string_matching index 4dd6519..2140c26 100755 Binary files a/simple_string_matching and b/simple_string_matching differ diff --git a/simple_string_matching.cpp b/simple_string_matching.cpp index 735ceac..075f99c 100644 --- a/simple_string_matching.cpp +++ b/simple_string_matching.cpp @@ -1,6 +1,6 @@ /* #define NDEBUG */ /* #include "Rabin_fingerprint.hpp" */ -/* #include "general_library.hpp" */ +#include "general_library.hpp" #include "processes.hpp" #include @@ -9,46 +9,42 @@ #include #include -void print_match (size_t index, size_t length, std::string &T) { - std::cout << "Match found at index " << index << " with the text \""; - for (size_t i = 0; i < length; i++) - std::cout << T[index + i]; - std::cout << "\"" << std::endl; -} - int main() { - /* std::ifstream ifs("books/the_complete_works_of_william_shakespeare.txt"); */ - std::ifstream ifs("books/genji_monogatari_english.txt"); - std::string T( (std::istreambuf_iterator(ifs) ), - (std::istreambuf_iterator() ) ); - - std::string P = "word"; + /* std::ifstream T("books/the_complete_works_of_william_shakespeare.txt"); */ + std::ifstream T("books/genji_monogatari_english.txt"); + std::ifstream 
P("books/pattern.txt"); + char c; + size_t P_length = 0; std::cout << "Searching for pattern:" << std::endl; - std::cout << " " << P << std::endl; - /* std::cout << "in text:" << std::endl; */ - /* std::cout << " " << T << std::endl; */ - std::cout << std::endl; + std::cout << " "; + while(P.get(c)) { + std::cout << c; + P_length++; + + } + std::cout << std::endl << std::endl; + + P.clear(); // clear fail and eof bits + P.seekg(0, std::ios::beg); // back to the start! uint32_t irreducible_polynomial = get_random_irreducible_polynomial_in_Z2(31); - size_t window_size_in_bits = P.length()*8; + size_t window_size_in_bits = P_length*8; // Hash the pattern Rabin_fingerprint_process phiP(irreducible_polynomial, (size_t)window_size_in_bits); - for (char c : P) + while(P.get(c)) { phiP.stream_char(c); + } // Hash the text Rabin_fingerprint_process phiT(irreducible_polynomial, window_size_in_bits); - for (size_t i = 0; i < P.length(); i++) - phiT.stream_char(T[i]); - if (phiT.get_fingerprint() == phiP.get_fingerprint()) - print_match(0, P.length(), T); - - for (size_t i = P.length(); i < T.length(); i++) { - phiT.stream_char(T[i]); + size_t index = 0; + while(T.get(c)) { + phiT.stream_char(c); + index++; if (phiT.get_fingerprint() == phiP.get_fingerprint()) - print_match(i-P.length()+1, P.length(), T); + std::cout << "Match found at index " << index-P_length << " with the text \"" << phiT.get_string_in_window() << "\"" << std::endl; } std::cout << std::endl;