Better string handling

The text and pattern are now streamed directly (skipping saving them to
strings).
Knowing exactly what string was matched has now been delegated to
the Rabin_fingerprint_process.
This commit is contained in:
Knyffen 2021-11-14 18:13:51 +01:00
parent c6b6329134
commit b9645f15ff
7 changed files with 73 additions and 32 deletions

View File

@ -2,9 +2,20 @@
Code used for my Bachelor's Thesis
# Issues
When reading the pattern from a file, note that according to UNIX standards ([read details here](https://unix.stackexchange.com/questions/18743/whats-the-point-in-adding-a-new-line-to-the-end-of-a-file)), a "text file" has to end with a newline.
This adds an unintended newline to the end of the pattern. To avoid adding said newline, write the file using:
`echo -n word > pattern.txt`
or if you use Vim, remove the newline using
```
:set binary
:set noeol
:wq
```
# Versions
## Current Version
Compared to V1, this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder.
Compared to V1, in this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder.
This version also contains code relating to Porat-Porat and other random code stubs. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work.

Binary file not shown.

1
books/pattern.txt Normal file
View File

@ -0,0 +1 @@
word

View File

@ -5,14 +5,14 @@ Rabin_fingerprint_process::Rabin_fingerprint_process(uint32_t irr_poly, size_t w
phi(irr_poly, window_size_in_bits)
{}
void Rabin_fingerprint_process::stream_char (char c) {
/**
 * Feeds one character into the fingerprinting process, bit by bit.
 *
 * The eight bits of `c` are passed to stream_bit() starting with the most
 * significant bit (bit 7) and ending with the least significant bit (bit 0).
 *
 * @param c the character to stream.
 */
void Rabin_fingerprint_process::stream_char(char c) {
    // Go through unsigned char so only the 8 value bits of c are considered,
    // regardless of whether plain char is signed on this platform.
    const std::bitset<8> bits(static_cast<unsigned char>(c));
    // The counter must be a signed int: with a plain `char` counter the
    // condition `i >= 0` never becomes false on platforms where char is
    // unsigned, and the loop would index the bitset out of range.
    for (int i = 7; i >= 0; i--) {
        stream_bit(bits[static_cast<size_t>(i)]);
    }
}
void Rabin_fingerprint_process::stream_bit (bool b) {
void Rabin_fingerprint_process::stream_bit(bool b) {
if (window.size() == window_size_in_bits) {
window.push(b);
bool b_out = window.front();
@ -24,6 +24,36 @@ void Rabin_fingerprint_process::stream_bit (bool b) {
}
}
uint32_t Rabin_fingerprint_process::get_fingerprint () {
/**
 * Returns the fingerprint currently reported by the underlying `phi` object.
 *
 * @return the value of phi.get_fingerprint().
 */
uint32_t Rabin_fingerprint_process::get_fingerprint() {
    const uint32_t current_fingerprint = phi.get_fingerprint();
    return current_fingerprint;
}
/**
 * Reconstructs the characters currently held in the sliding window.
 *
 * The window stores single bits; every group of eight consecutive bits is
 * reassembled into one char, first bit of the group becoming the most
 * significant bit (the order in which stream_char() feeds them).
 *
 * The bits are read by rotating the queue (pop from the front, push to the
 * back). A full rotation is always performed, so the window is in its
 * original order again when the function returns.
 *
 * @return the window contents as a string of window.size() / 8 characters.
 * @throws std::logic_error if the configured window size is not a multiple
 *         of 8 bits, or (only when NDEBUG is not defined) if the window has
 *         not been completely filled yet.
 */
std::string Rabin_fingerprint_process::get_string_in_window() {
    // check if window contains a whole number of chars
    if ((window_size_in_bits & 0b111) != 0)
        throw std::logic_error("The fingerprinting window doesn't contain a whole number of chars (counting the bits), so it doesn't make sense to return it as a string.");
#ifndef NDEBUG
    if (window.size() != window_size_in_bits)
        throw std::logic_error("False match! The sliding window isn't even filled yet, which means you matched the pattern of a substring shorter than the pattern. This case should be handled/avoided elsewhere, so we throw an error.");
#endif

    const size_t bits_in_window = window.size();
    const size_t whole_chars = bits_in_window >> 3;

    std::string s;
    s.reserve(whole_chars);
    for (size_t i = 0; i < whole_chars; i++) {
        // cycle the char: move its 8 bits from the front of the queue to the
        // back while assembling them, most significant bit first
        unsigned char c = 0;
        for (size_t j = 0; j < 8; j++) {
            const bool b = window.front();
            window.pop();
            window.push(b);
            c = static_cast<unsigned char>((c << 1) | (b ? 1u : 0u));
        }
        s.push_back(static_cast<char>(c));
    }

    // When the window is only partially filled (reachable in NDEBUG builds)
    // and holds a bit count that is not a multiple of 8, the trailing bits
    // must be cycled as well. Otherwise the queue is left rotated and later
    // stream_bit() calls would evict the wrong bits.
    for (size_t j = whole_chars << 3; j < bits_in_window; j++) {
        const bool b = window.front();
        window.pop();
        window.push(b);
    }

    return s;
}

View File

@ -6,6 +6,8 @@
#include <stdint.h>
#include <queue>
#include <string>
#include <sstream>
class Rabin_fingerprint_process {
public:
@ -13,6 +15,7 @@ class Rabin_fingerprint_process {
void stream_char(char c);
void stream_bit(bool b);
uint32_t get_fingerprint();
std::string get_string_in_window();
private:
std::queue<bool> window;

Binary file not shown.

View File

@ -1,6 +1,6 @@
/* #define NDEBUG */
/* #include "Rabin_fingerprint.hpp" */
/* #include "general_library.hpp" */
#include "general_library.hpp"
#include "processes.hpp"
#include <iostream>
@ -9,46 +9,42 @@
#include <string>
#include <fstream>
/**
 * Prints a "Match found" line for a match inside the text T to std::cout.
 *
 * The printed excerpt is clamped to the bounds of T: if index lies beyond the
 * end of T, or index + length runs past it, only the characters that actually
 * exist are printed instead of reading outside the string.
 *
 * @param index  position in T where the match starts (printed as given).
 * @param length number of characters belonging to the match.
 * @param T      the text the match was found in; it is not modified.
 */
void print_match (size_t index, size_t length, const std::string &T) {
    const size_t start = index < T.size() ? index : T.size();
    const size_t available = T.size() - start;
    const size_t count = length < available ? length : available;

    std::cout << "Match found at index " << index << " with the text \"";
    std::cout.write(T.data() + start, static_cast<std::streamsize>(count));
    // std::endl is kept on purpose so every match is flushed as it is found.
    std::cout << "\"" << std::endl;
}
int main() {
/* std::ifstream ifs("books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream ifs("books/genji_monogatari_english.txt");
std::string T( (std::istreambuf_iterator<char>(ifs) ),
(std::istreambuf_iterator<char>() ) );
std::string P = "word";
/* std::ifstream T("books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream T("books/genji_monogatari_english.txt");
std::ifstream P("books/pattern.txt");
char c;
size_t P_length = 0;
std::cout << "Searching for pattern:" << std::endl;
std::cout << " " << P << std::endl;
/* std::cout << "in text:" << std::endl; */
/* std::cout << " " << T << std::endl; */
std::cout << std::endl;
std::cout << " ";
while(P.get(c)) {
std::cout << c;
P_length++;
}
std::cout << std::endl << std::endl;
P.clear(); // clear fail and eof bits
P.seekg(0, std::ios::beg); // back to the start!
uint32_t irreducible_polynomial = get_random_irreducible_polynomial_in_Z2(31);
size_t window_size_in_bits = P.length()*8;
size_t window_size_in_bits = P_length*8;
// Hash the pattern
Rabin_fingerprint_process phiP(irreducible_polynomial, (size_t)window_size_in_bits);
for (char c : P)
while(P.get(c)) {
phiP.stream_char(c);
}
// Hash the text
Rabin_fingerprint_process phiT(irreducible_polynomial, window_size_in_bits);
for (size_t i = 0; i < P.length(); i++)
phiT.stream_char(T[i]);
size_t index = 0;
while(T.get(c)) {
phiT.stream_char(c);
index++;
if (phiT.get_fingerprint() == phiP.get_fingerprint())
print_match(0, P.length(), T);
for (size_t i = P.length(); i < T.length(); i++) {
phiT.stream_char(T[i]);
if (phiT.get_fingerprint() == phiP.get_fingerprint())
print_match(i-P.length()+1, P.length(), T);
std::cout << "Match found at index " << index-P_length << " with the text \"" << phiT.get_string_in_window() << "\"" << std::endl;
}
std::cout << std::endl;