Compare commits

..

2 Commits

Author SHA1 Message Date
b9645f15ff Better string handling
The text and pattern are now streamed directly (skipping saving them to
strings).
Knowing exactly what string was matched has now been delegated to
the Rabin_fingerprint_process.
2021-11-14 18:13:51 +01:00
c6b6329134 Abstract simple string matching to a process
The process contains the fingerprinting function along with a queue
representing the sliding window. This means that the user no longer
needs to remember the outgoing character when streaming the text.
2021-11-14 16:19:11 +01:00
9 changed files with 134 additions and 44 deletions

View File

@ -24,8 +24,8 @@ LDLIBS = # library flags
# -llapacke (LAPACK) NOTE: The order of "-llapacke -llapack -lblas" is very important # -llapacke (LAPACK) NOTE: The order of "-llapacke -llapack -lblas" is very important
#LINK.o = $(CXX) $(LDFLAGS) # use CXX for linking #LINK.o = $(CXX) $(LDFLAGS) # use CXX for linking
simple_string_matching: simple_string_matching.o Rabin_fingerprint.o general_library.o simple_string_matching: simple_string_matching.o Rabin_fingerprint.o general_library.o processes.o
$(CXX) $(CXXFLAGS) simple_string_matching.o Rabin_fingerprint.o general_library.o -o simple_string_matching $(CXX) $(CXXFLAGS) simple_string_matching.o Rabin_fingerprint.o general_library.o processes.o -o simple_string_matching
simple_string_matching.o: simple_string_matching.cpp simple_string_matching.o: simple_string_matching.cpp
$(CXX) $(CXXFLAGS) -c simple_string_matching.cpp $(CXX) $(CXXFLAGS) -c simple_string_matching.cpp
@ -36,6 +36,9 @@ Rabin_fingerprint.o: Rabin_fingerprint.cpp Rabin_fingerprint.hpp
general_library.o: general_library.cpp general_library.hpp general_library.o: general_library.cpp general_library.hpp
$(CXX) $(CXXFLAGS) -c general_library.cpp $(CXX) $(CXXFLAGS) -c general_library.cpp
# processes.hpp includes Rabin_fingerprint.hpp and general_library.hpp, so a
# change to either of those headers must also rebuild processes.o.
processes.o: processes.cpp processes.hpp Rabin_fingerprint.hpp general_library.hpp
	$(CXX) $(CXXFLAGS) -c processes.cpp
porat-porat: porat-porat.cpp porat-porat: porat-porat.cpp
# Tell the compiler that 'clean' isn't referring to a file # Tell the compiler that 'clean' isn't referring to a file

View File

@ -2,9 +2,20 @@
Code used for my Bachelor's Thesis Code used for my Bachelor's Thesis
# Issues
The pattern is read from a file, and according to UNIX standards ([read details here](https://unix.stackexchange.com/questions/18743/whats-the-point-in-adding-a-new-line-to-the-end-of-a-file)) a "text file" has to end with a newline, so most tools append one when saving.
That trailing newline would become an unintended part of the pattern. To avoid it, write the file using:
`echo -n word > pattern.txt`
or if you use Vim, remove the newline using
```
:set binary
:set noeol
:wq
```
# Versions # Versions
## Current Version ## Current Version
Compared to V1, this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder. Compared to V1, in this current state `generate_initial_irreducible_polynomials.sage`, `generate_random_irreducible_polynomial.sage`, and `multiply_polynomials_modulo_polynomial.sage` have been removed from the main folder, as they have already been used for implementing their corresponding features in `general_library.cpp`. They will be preserved in the V1 folder.
This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work. This version also contains code relating to Porat-Porat and other random code stumps. This code is going to be completely reimplemented, but is preserved for now to avoid redoing work.

View File

@ -14,7 +14,7 @@ class Rabin_fingerprint {
void push_bit (bool b); void push_bit (bool b);
void shift_bit (bool b); void shift_bit (bool b);
void slide_char (char c_in, char c_out); void slide_char (char c_in, char c_out);
void slide_bit (bool b1, bool b2); void slide_bit (bool b_in, bool b_out);
uint32_t get_fingerprint(); uint32_t get_fingerprint();

Binary file not shown.

1
books/pattern.txt Normal file
View File

@ -0,0 +1 @@
word

59
processes.cpp Normal file
View File

@ -0,0 +1,59 @@
#include "processes.hpp"
// Constructs a streaming fingerprint process.
//
// irr_poly            - forwarded unchanged to the Rabin_fingerprint member
//                       (presumably the irreducible modulus polynomial; main
//                       passes the result of get_random_irreducible_polynomial_in_Z2).
// window_size_in_bits - number of bits the sliding window may hold before
//                       stream_bit starts evicting the oldest bit.
//
// The bit queue `window` is default-constructed, i.e. it starts out empty.
Rabin_fingerprint_process::Rabin_fingerprint_process(uint32_t irr_poly, size_t window_size_in_bits)
// The parameter shadows the member of the same name: the member is initialised from the parameter.
: window_size_in_bits(window_size_in_bits),
// The fingerprint is built from the same polynomial and window size.
phi(irr_poly, window_size_in_bits)
{}
// Streams one character into the process, most significant bit first.
//
// c - the character to feed; its 8 low-order bits are passed to stream_bit
//     one at a time, from bit 7 down to bit 0.
void Rabin_fingerprint_process::stream_char(char c) {
    // Work on the raw byte so the shifts below never touch a sign bit.
    const unsigned char byte = static_cast<unsigned char>(c);
    // The counter must be a signed int: plain `char` is unsigned on some
    // platforms, where `i >= 0` would always hold and the loop would never end.
    for (int i = 7; i >= 0; i--) {
        stream_bit(((byte >> i) & 1u) != 0);
    }
}
// Feeds a single bit into the sliding window and updates the fingerprint.
//
// While the window still has room the bit is simply appended (push_bit).
// Once the window holds window_size_in_bits bits, the oldest bit is evicted
// and the fingerprint is slid over by one position (slide_bit).
void Rabin_fingerprint_process::stream_bit(bool b) {
    // Decide before pushing, because the push changes the size.
    const bool window_was_full = (window.size() == window_size_in_bits);

    // The incoming bit is enqueued in both cases. It has to happen before
    // front() is read so that the queue is never empty at that point.
    window.push(b);

    if (!window_was_full) {
        phi.push_bit(b);
        return;
    }

    const bool outgoing_bit = window.front();
    window.pop();
    phi.slide_bit(b, outgoing_bit);
}
// Returns the fingerprint currently reported by the underlying
// Rabin_fingerprint member `phi`; this is a plain forwarding call.
uint32_t Rabin_fingerprint_process::get_fingerprint() {
return phi.get_fingerprint();
}
// Returns the contents of the sliding window as a string, 8 bits per char,
// most significant bit first (the same order stream_char feeds them in).
//
// The queue offers no random access, so the bits are read by rotating it:
// every bit is popped from the front and pushed to the back again. After a
// full rotation the window is in exactly the state it was in before the call.
//
// Throws std::logic_error if window_size_in_bits is not a multiple of 8, and,
// in debug builds only (NDEBUG not defined), if the window is not yet full.
std::string Rabin_fingerprint_process::get_string_in_window() {
    // check if window contains a whole number of chars
    if ((window_size_in_bits & 0b111) != 0)
        throw std::logic_error("The fingerprinting window doesn't contain a whole number of chars (counting the bits), so it doesn't make sense to return it as a string.");
#ifndef NDEBUG
    if (window.size() != window_size_in_bits)
        throw std::logic_error("False match! The sliding window isn't even filled yet, which means you matched the pattern of a substring shorter than the pattern. This case should be handled/avoided elsewhere, so we throw an error.");
#endif
    const size_t bits_in_window = window.size();
    const size_t whole_chars = bits_in_window >> 3;

    std::string s;
    s.reserve(whole_chars);
    for (size_t i = 0; i < whole_chars; i++) {
        // cycle the char
        unsigned char c = 0;
        for (size_t j = 0; j < 8; j++) {
            const bool b = window.front();
            window.pop();
            window.push(b);
            c = static_cast<unsigned char>((c << 1) | (b ? 1u : 0u));
        }
        s.push_back(static_cast<char>(c));
    }

    // In release builds the window may be only partially filled, leaving up to
    // 7 bits that do not form a whole char. Rotate those as well, otherwise the
    // queue stays rotated and every later slide would evict the wrong bit.
    for (size_t j = whole_chars << 3; j < bits_in_window; j++) {
        const bool b = window.front();
        window.pop();
        window.push(b);
    }
    return s;
}

26
processes.hpp Normal file
View File

@ -0,0 +1,26 @@
#ifndef PROCESSES_H
#define PROCESSES_H
#include "Rabin_fingerprint.hpp"
#include "general_library.hpp"
#include <stdint.h>
#include <queue>
#include <string>
#include <sstream>
// Bundles a Rabin fingerprint with a queue representing its sliding window,
// so that callers streaming text do not have to remember the outgoing
// character/bit themselves.
class Rabin_fingerprint_process {
public:
// Creates a process with an empty window of capacity window_size_in_bits;
// both arguments are forwarded to the Rabin_fingerprint member.
Rabin_fingerprint_process(uint32_t irr_poly, size_t window_size_in_bits);
// Streams the 8 bits of c into the window, bit 7 first.
void stream_char(char c);
// Streams one bit; once the window is full the oldest bit is evicted.
void stream_bit(bool b);
// Returns the fingerprint reported by the underlying Rabin_fingerprint.
uint32_t get_fingerprint();
// Returns the window contents as a string (8 bits per char). Throws
// std::logic_error if the window size is not a whole number of chars.
// Not const: the implementation rotates the queue to read it.
std::string get_string_in_window();
private:
// Bits currently inside the sliding window, oldest bit at the front.
std::queue<bool> window;
// Capacity of the window in bits. Declared before phi, so it is initialised first.
size_t window_size_in_bits;
// The fingerprint that is updated as bits enter and leave the window.
Rabin_fingerprint phi;
};
#endif

BIN
simple_string_matching Executable file

Binary file not shown.

View File

@ -1,6 +1,7 @@
/* #define NDEBUG */ /* #define NDEBUG */
#include "Rabin_fingerprint.hpp" /* #include "Rabin_fingerprint.hpp" */
#include "general_library.hpp" #include "general_library.hpp"
#include "processes.hpp"
#include <iostream> #include <iostream>
#include <stdint.h> #include <stdint.h>
@ -8,53 +9,42 @@
#include <string> #include <string>
#include <fstream> #include <fstream>
// Prints a "Match found" line for the substring of T that starts at `index`
// and is `length` characters long.
//
// index  - start of the match inside T; printed as given.
// length - number of characters in the match.
// T      - the text that was searched; only read, hence taken by const reference.
//
// The printed range is clamped to the end of T, so an index/length pair that
// runs past the text prints only the characters that exist rather than
// reading out of bounds.
void print_match (size_t index, size_t length, const std::string &T) {
    // substr() requires start <= size() and clamps the count by itself.
    const size_t start = index < T.size() ? index : T.size();
    std::cout << "Match found at index " << index << " with the text \""
              << T.substr(start, length)
              << "\"" << std::endl;
}
int main() { int main() {
/* std::ifstream ifs("books/the_complete_works_of_william_shakespeare.txt"); */ /* std::ifstream T("books/the_complete_works_of_william_shakespeare.txt"); */
std::ifstream ifs("books/genji_monogatari_english.txt"); std::ifstream T("books/genji_monogatari_english.txt");
std::string T( (std::istreambuf_iterator<char>(ifs) ), std::ifstream P("books/pattern.txt");
(std::istreambuf_iterator<char>() ) );
/* std::string T = "Hello, this is my test string averylongword is a necessary word to exceed the 32 bit window."; */
// Test without the modulo polynomial - and two matches
std::string P = "word";
// Test with the modulo polynomial
/* std::string P = "averylongword"; */
char c;
size_t P_length = 0;
std::cout << "Searching for pattern:" << std::endl; std::cout << "Searching for pattern:" << std::endl;
std::cout << " " << P << std::endl; std::cout << " ";
/* std::cout << "in text:" << std::endl; */ while(P.get(c)) {
/* std::cout << " " << T << std::endl; */ std::cout << c;
std::cout << std::endl; P_length++;
/* uint32_t polynomial = pow(2, 30) + pow(2, 2) + 1; // x^31 + x^3 + 1 */ }
uint32_t polynomial = get_random_irreducible_polynomial_in_Z2(31); std::cout << std::endl << std::endl;
/* uint32_t polynomial = 0b11010011100100000111101011110111; */
// Test without the modulo polynomial P.clear(); // clear fail and eof bits
size_t window_size_in_bits = P.length()*8; P.seekg(0, std::ios::beg); // back to the start!
uint32_t irreducible_polynomial = get_random_irreducible_polynomial_in_Z2(31);
size_t window_size_in_bits = P_length*8;
// Hash the pattern // Hash the pattern
Rabin_fingerprint fP(polynomial, window_size_in_bits); Rabin_fingerprint_process phiP(irreducible_polynomial, (size_t)window_size_in_bits);
for (char c : P) while(P.get(c)) {
fP.push_char(c); phiP.stream_char(c);
}
// Hash the text // Hash the text
Rabin_fingerprint fT(polynomial, window_size_in_bits); Rabin_fingerprint_process phiT(irreducible_polynomial, window_size_in_bits);
for (size_t i = 0; i < P.length(); i++) size_t index = 0;
fT.push_char(T[i]); while(T.get(c)) {
if (fT.get_fingerprint() == fP.get_fingerprint()) phiT.stream_char(c);
print_match(0, P.length(), T); index++;
if (phiT.get_fingerprint() == phiP.get_fingerprint())
for (size_t i = P.length(); i < T.length(); i++) { std::cout << "Match found at index " << index-P_length << " with the text \"" << phiT.get_string_in_window() << "\"" << std::endl;
fT.slide_char(T[i], T[i-P.length()]);
if (fT.get_fingerprint() == fP.get_fingerprint())
print_match(i-P.length()+1, P.length(), T);
} }
std::cout << std::endl; std::cout << std::endl;