Better string handling

The text and pattern are now streamed directly (instead of first being saved
to strings).
Knowing exactly what string was matched has now been delegated to
the Rabin_fingerprint_process.
This commit is contained in:
Knyffen 2021-11-14 18:13:51 +01:00
parent c6b6329134
commit b9645f15ff
7 changed files with 73 additions and 32 deletions

View File

@ -2,9 +2,20 @@
Code used for my Bachelor's Thesis Code used for my Bachelor's Thesis
# Issues
According to UNIX standards ([read details here](https://unix.stackexchange.com/questions/18743/whats-the-point-in-adding-a-new-line-to-the-end-of-a-file)), a "text file" has to end with a newline.
When the pattern is read from a file, this adds an unintended newline to the end of the pattern; to avoid adding said newline, write the file using:
`echo -n word > pattern.txt`
or if you use Vim, remove the newline using
```
:set binary
:set noeol
:wq
```
# Versions # Versions
## Current Version ## Current Version
Compared to V1, this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder. Compared to V1, in this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder.
This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work. This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work.

Binary file not shown.

1
books/pattern.txt Normal file
View File

@ -0,0 +1 @@
word

View File

@ -27,3 +27,33 @@ void Rabin_fingerprint_process::stream_bit (bool b) {
uint32_t Rabin_fingerprint_process::get_fingerprint() { uint32_t Rabin_fingerprint_process::get_fingerprint() {
return phi.get_fingerprint(); return phi.get_fingerprint();
} }
/**
 * Reconstructs the characters currently held in the sliding window.
 *
 * The window stores single bits; every group of 8 bits is reassembled into
 * one char, with the bit nearest the front of the queue becoming the most
 * significant bit. A std::queue cannot be iterated, so each bit is popped
 * from the front and pushed to the back again. Every bit in the window is
 * cycled exactly once, which leaves the window in its original order.
 *
 * @return the window contents as a string, one char per 8 bits in the window.
 * @throws std::logic_error if window_size_in_bits is not a multiple of 8, or
 *         (only when NDEBUG is not defined) if the window is not filled yet.
 */
std::string Rabin_fingerprint_process::get_string_in_window() {
    // check if window contains a whole number of chars
    if ((window_size_in_bits & 0b111) != 0)
        throw std::logic_error("The fingerprinting window doesn't contain a whole number of chars (counting the bits), so it doesn't make sense to return it as a string.");
#ifndef NDEBUG
    if (window.size() != window_size_in_bits)
        throw std::logic_error("False match! The sliding window isn't even filled yet, which means you matched the pattern of a substring shorter than the pattern. This case should be handled/avoided elsewhere, so we throw an error.");
#endif
    const size_t bits_in_window = window.size();
    const size_t whole_chars = bits_in_window >> 3;

    std::string s;
    s.reserve(whole_chars);
    for (size_t i = 0; i < whole_chars; i++) {
        // cycle the char: move 8 bits from the front to the back while
        // collecting them, most significant bit first
        unsigned char c = 0;
        for (size_t j = 0; j < 8; j++) {
            const bool b = window.front();
            window.pop();
            window.push(b);
            c = static_cast<unsigned char>((c << 1) | (b ? 1u : 0u));
        }
        s.push_back(static_cast<char>(c));
    }

    // If the window holds a bit count that is not a multiple of 8 (only
    // reachable when the fill check above is compiled out), cycle the
    // leftover bits as well. Otherwise the queue would stay rotated and the
    // window order would be corrupted for subsequent streaming.
    for (size_t j = whole_chars << 3; j < bits_in_window; j++) {
        const bool b = window.front();
        window.pop();
        window.push(b);
    }
    return s;
}

View File

@ -6,6 +6,8 @@
#include <stdint.h> #include <stdint.h>
#include <queue> #include <queue>
#include <string>
#include <sstream>
class Rabin_fingerprint_process { class Rabin_fingerprint_process {
public: public:
@ -13,6 +15,7 @@ class Rabin_fingerprint_process {
void stream_char(char c); void stream_char(char c);
void stream_bit(bool b); void stream_bit(bool b);
uint32_t get_fingerprint(); uint32_t get_fingerprint();
std::string get_string_in_window();
private: private:
std::queue<bool> window; std::queue<bool> window;

Binary file not shown.

View File

@ -1,6 +1,6 @@
/* #define NDEBUG */ /* #define NDEBUG */
/* #include "Rabin_fingerprint.hpp" */ /* #include "Rabin_fingerprint.hpp" */
/* #include "general_library.hpp" */ #include "general_library.hpp"
#include "processes.hpp" #include "processes.hpp"
#include <iostream> #include <iostream>
@ -9,46 +9,42 @@
#include <string> #include <string>
#include <fstream> #include <fstream>
/**
 * Prints a match report to std::cout in the form
 *   Match found at index <index> with the text "<matched text>"
 * followed by a newline and a flush.
 *
 * @param index  position in T at which the match starts.
 * @param length number of chars of T to print as the matched text.
 * @param T      the text that was searched; it is only read.
 *
 * The printed range is clamped to the bounds of T, so an index/length pair
 * reaching past the end of the text prints only the chars that exist
 * instead of reading out of bounds.
 */
void print_match (size_t index, size_t length, std::string &T) {
    // Clamp the start and the count so the range never leaves the text.
    const size_t start = index < T.size() ? index : T.size();
    const size_t available = T.size() - start;
    const size_t count = length < available ? length : available;

    std::cout << "Match found at index " << index << " with the text \"";
    std::cout.write(T.data() + start, static_cast<std::streamsize>(count));
    std::cout << "\"" << std::endl;
}
int main() { int main() {
/* std::ifstream ifs("books/the_complete_works_of_william_shakespeare.txt"); */ /* std::ifstream T("books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream ifs("books/genji_monogatari_english.txt"); std::ifstream T("books/genji_monogatari_english.txt");
std::string T( (std::istreambuf_iterator<char>(ifs) ), std::ifstream P("books/pattern.txt");
(std::istreambuf_iterator<char>() ) );
std::string P = "word";
char c;
size_t P_length = 0;
std::cout << "Searching for pattern:" << std::endl; std::cout << "Searching for pattern:" << std::endl;
std::cout << " " << P << std::endl; std::cout << " ";
/* std::cout << "in text:" << std::endl; */ while(P.get(c)) {
/* std::cout << " " << T << std::endl; */ std::cout << c;
std::cout << std::endl; P_length++;
}
std::cout << std::endl << std::endl;
P.clear(); // clear fail and eof bits
P.seekg(0, std::ios::beg); // back to the start!
uint32_t irreducible_polynomial = get_random_irreducible_polynomial_in_Z2(31); uint32_t irreducible_polynomial = get_random_irreducible_polynomial_in_Z2(31);
size_t window_size_in_bits = P.length()*8; size_t window_size_in_bits = P_length*8;
// Hash the pattern // Hash the pattern
Rabin_fingerprint_process phiP(irreducible_polynomial, (size_t)window_size_in_bits); Rabin_fingerprint_process phiP(irreducible_polynomial, (size_t)window_size_in_bits);
for (char c : P) while(P.get(c)) {
phiP.stream_char(c); phiP.stream_char(c);
}
// Hash the text // Hash the text
Rabin_fingerprint_process phiT(irreducible_polynomial, window_size_in_bits); Rabin_fingerprint_process phiT(irreducible_polynomial, window_size_in_bits);
for (size_t i = 0; i < P.length(); i++) size_t index = 0;
phiT.stream_char(T[i]); while(T.get(c)) {
phiT.stream_char(c);
index++;
if (phiT.get_fingerprint() == phiP.get_fingerprint()) if (phiT.get_fingerprint() == phiP.get_fingerprint())
print_match(0, P.length(), T); std::cout << "Match found at index " << index-P_length << " with the text \"" << phiT.get_string_in_window() << "\"" << std::endl;
for (size_t i = P.length(); i < T.length(); i++) {
phiT.stream_char(T[i]);
if (phiT.get_fingerprint() == phiP.get_fingerprint())
print_match(i-P.length()+1, P.length(), T);
} }
std::cout << std::endl; std::cout << std::endl;