#include "hashing_algorithms.hpp"

#include "hashed_string.hpp"

#include <string.h>


// Multiply-mod-prime hash: ((a*x + b) mod p) mod m.
// Expected parameters: p is prime, p > m > 1, p > a > 0, p > b >= 0.
// The linear term a*x + b is evaluated in 64-bit unsigned arithmetic, so it
// wraps modulo 2^64 before the reduction by p whenever it does not fit.
uint64_t multiply_mod_prime(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m) {
    const uint64_t linear = a * x + b;
    const uint64_t residue = linear % p;
    return residue % m;
}

// Multiply-mod-prime hash specialised for a Mersenne prime p = 2^k - 1:
// returns ((a*x + b) mod p) mod m without a division by p.
// Expected parameters: p is a Mersenne prime, p > m > 1, p > a > 0, p > b >= 0.
// The linear term a*x + b is evaluated in 64-bit unsigned arithmetic, so the
// result is exact only while a*x + b fits in 64 bits.
uint64_t multiply_mod_prime_mersenne(uint64_t x, uint64_t a, uint64_t b, uint64_t p, uint64_t m) {
    // For p = 2^k - 1 the exponent k is the number of significant bits of p.
    // The fold below must shift by k, not by the value of p itself.
    unsigned p_bits = 0;
    for (uint64_t rest = p; rest != 0; rest >>= 1)
        p_bits++;

    uint64_t y = a*x+b;

    // Since 2^k == 1 (mod p), y == (y & p) + (y >> k) (mod p). Each fold strictly
    // shrinks y while y > p, so the loop terminates. y > p implies k < 64, which
    // keeps the shift amount in range.
    while (p_bits != 0 && y > p)
        y = (y & p) + (y >> p_bits);

    // y == p is congruent to 0.
    if (y == p)
        y = 0;

    return y % m;
}

// Multiply-mod-prime hash for a Mersenne prime p = 2^k - 1 and a power-of-two
// range m = 2^q: returns ((a*x + b) mod p) mod 2^q.
// Expected parameters: p is a Mersenne prime, p > m > 1, p > a > 0, p > b >= 0,
// 64 >= q >= 0.
// The linear term a*x + b is evaluated in 64-bit unsigned arithmetic, so the
// result is exact only while a*x + b fits in 64 bits.
uint64_t multiply_mod_prime_mersenne_overflow(uint64_t x, uint64_t a, uint64_t b, uint64_t p, char q) {
    // For p = 2^k - 1 the exponent k is the number of significant bits of p.
    // The fold below must shift by k, not by the value of p itself.
    unsigned p_bits = 0;
    for (uint64_t rest = p; rest != 0; rest >>= 1)
        p_bits++;

    uint64_t y = a*x+b;

    // Since 2^k == 1 (mod p), y == (y & p) + (y >> k) (mod p). y > p implies
    // k < 64, which keeps the shift amount in range.
    while (p_bits != 0 && y > p)
        y = (y & p) + (y >> p_bits);
    if (y == p)
        y = 0;

    // Shifting a 64-bit value by 64 is undefined, so q == 64 (keep every bit)
    // is handled separately.
    if (q >= 64)
        return y;
    return y & ~( ( ~(uint64_t)0 ) << q);
}

// Multiply-mod-prime hash without the additive term, for a Mersenne prime
// p = 2^k - 1 and a power-of-two range m = 2^q: returns ((a*x) mod p) mod 2^q.
// Expected parameters: p is a Mersenne prime, p > m > 1, p > a > 0, 64 >= q >= 0.
// The product a*x is evaluated in 64-bit unsigned arithmetic, so the result is
// exact only while a*x fits in 64 bits.
uint64_t multiply_mod_prime_mersenne_overflow_no_b(uint64_t x, uint64_t a, uint64_t p, char q) {
    // For p = 2^k - 1 the exponent k is the number of significant bits of p.
    // The fold below must shift by k, not by the value of p itself.
    unsigned p_bits = 0;
    for (uint64_t rest = p; rest != 0; rest >>= 1)
        p_bits++;

    uint64_t y = a*x;

    // Since 2^k == 1 (mod p), y == (y & p) + (y >> k) (mod p). y > p implies
    // k < 64, which keeps the shift amount in range.
    while (p_bits != 0 && y > p)
        y = (y & p) + (y >> p_bits);
    if (y == p)
        y = 0;

    // Shifting a 64-bit value by 64 is undefined, so q == 64 (keep every bit)
    // is handled separately.
    if (q >= 64)
        return y;
    return y & ~( ( ~(uint64_t)0 ) << q);
}

// Multiply-mod-prime hash of a 64-bit key with the Mersenne prime p = 2^89 - 1.
//
// All multi-word integers are arrays of 32-bit words with the MOST significant
// word first (index 0):
//   x: 64-bit key, 2 words.
//   a: 89-bit multiplier, 3 words, p > a > 0.
//   b: 89-bit additive term, 3 words, p > b >= 0.
//   l: number of output bits, 32 >= l > 0.
//
// Computes y = (a*x + b) mod p and returns the l most significant bits of the
// least significant 32-bit word of y, i.e. (y mod 2^32) >> (32 - l).
uint32_t multiply_mod_prime_mersenne_overflow_high_bitcount(uint32_t * x, uint32_t * a, uint32_t * b, char l) {
    const uint32_t low25 = 0x1FFFFFFu;  // the 25 bits of p that live in the top word
    const uint32_t all32 = 0xFFFFFFFFu;

    // ax: the 2*3 partial products x[i]*a[ii], each stored as (low word, high word)
    // at ax[6*i+2*ii] and ax[6*i+2*ii+1].
    uint32_t ax[12]; // 12 = 2*3*2
    for (size_t i = 0; i < 2; i++) { // index x
        for (size_t ii = 0; ii < 3; ii++) { // index a
            uint64_t tmp = (uint64_t)x[i] * (uint64_t)a[ii];
            ax[6*i+2*ii] = (uint32_t)tmp;
            ax[6*i+2*ii+1] = (uint32_t)(tmp >> 32);
        }
    }

    // y = a*x + b, summed column by column from the least significant word (y[4])
    // up to the most significant (y[0]). The upper half of `sum` is the carry
    // into the next column.
    uint32_t y[5];
    uint64_t sum;
    // bits 0-31
    sum = (uint64_t)ax[10] + b[2];
    y[4] = (uint32_t)sum;
    // bits 32-63
    sum = (sum >> 32) + ax[11] + ax[8] + ax[4] + b[1];
    y[3] = (uint32_t)sum;
    // bits 64-95
    sum = (sum >> 32) + ax[9] + ax[5] + ax[2] + ax[6] + b[0];
    y[2] = (uint32_t)sum;
    // bits 96-127
    sum = (sum >> 32) + ax[3] + ax[7] + ax[0];
    y[1] = (uint32_t)sum;
    // bits 128-159: the carry out of the previous column must be included
    y[0] = ax[1] + (uint32_t)(sum >> 32);

    //// reduce modulo p = 2^89 - 1 using y == (y & p) + (y >> 89) (mod p)
    // y & p: the low 89 bits
    uint32_t yandp[3];
    yandp[2] = y[4];
    yandp[1] = y[3];
    yandp[0] = y[2] & low25;

    // y >> 89: drop two full words and 25 further bits
    uint32_t yshift[3];
    yshift[2] = (y[2] >> 25) | (y[1] << 7);
    yshift[1] = (y[1] >> 25) | (y[0] << 7);
    yshift[0] = y[0] >> 25;

    // y = (y & p) + (y >> 89)
    sum = (uint64_t)yandp[2] + yshift[2];
    y[4] = (uint32_t)sum;
    sum = (sum >> 32) + yandp[1] + yshift[1];
    y[3] = (uint32_t)sum;
    y[2] = (uint32_t)((sum >> 32) + yandp[0] + yshift[0]);

    //// if y >= p: y -= p
    if ((y[2] >> 25) != 0) {
        // y >= 2^89: subtracting p = 2^89 - 1 means clearing bit 89 and adding 1
        y[2] &= low25;
        for (size_t i = 4; i > 1; i--) {
            y[i] += 1;
            if (y[i] != 0) // no wrap-around, so no carry into the next word
                break;
        }
    } else if (y[2] == low25 && y[3] == all32 && y[4] == all32) {
        // y == p exactly, which is congruent to 0
        y[2] = 0;
        y[3] = 0;
        y[4] = 0;
    }

    // keep the l most significant bits of the least significant word
    return (y[4] >> (32-l));
}

// Multiply-shift hash: the l most significant bits of the 64-bit product a*x.
// The product wraps modulo 2^64. l must satisfy 64 >= l > 0, because a shift
// by 64 is undefined.
uint64_t multiply_shift_c_universal(uint32_t x, uint64_t a, char l) {
    const uint64_t product = a * x;
    const int discarded_bits = 64 - l;
    return product >> discarded_bits;
}

// Multiply-add-shift hash: the l most significant bits of a*x + b, where the
// sum is taken modulo 2^64. l must satisfy 64 >= l > 0, because a shift by 64
// is undefined.
uint64_t multiply_shift_strongly_universal(uint32_t x, uint64_t a, uint64_t b, char l) {
    const uint64_t linear = a * x + b;
    const int discarded_bits = 64 - l;
    return linear >> discarded_bits;
}

// Multiply-shift hash of a vector of d 32-bit words: returns the l most
// significant bits of (sum_i seed[i]*x[i]) + seed[d-1], computed modulo 2^64.
// Reads x[0..d-1] and seed[0..d-1]; d must be at least 1 and 64 >= l > 0.
// NOTE(review): seed[d-1] serves both as the last multiplier and as the
// additive term; a separate seed[d] may have been intended -- confirm with callers.
uint64_t multiply_shift_vector(uint32_t * x, uint64_t * seed, size_t d, char l) {
    uint64_t dot = 0;
    size_t k = 0;
    while (k < d) {
        dot += seed[k] * x[k];
        ++k;
    }
    const uint64_t shifted_input = dot + seed[d-1];
    return shifted_input >> (64-l);
}

// Pair-multiply-shift hash of a byte string.
//
// The string's bytes are copied into the caller-provided scratch buffer x, with
// the last 64-bit word zero-padded. With d = ceil(string->size / 8), each word
// x[i] is split into two 32-bit halves and contributes
//   (seed[2*i] + high half) * (seed[2*i+1] + low half)
// to a sum taken modulo 2^64. The result is the l most significant bits of
// (sum + seed[d]).
//
// Requirements: x holds at least d 64-bit words; seed[0..2*d-1] and seed[d] are
// readable; 64 >= l > 0. An empty string yields seed[0] >> (64-l).
// NOTE(review): for d >= 1 the additive term seed[d] is also one of the pair
// seeds; seed[2*d] may have been intended -- confirm with callers.
uint64_t multiply_shift_string(const My_string * string, const uint64_t * seed, uint64_t * x, size_t l) {
    size_t d = (string->size+7) >> 3; // d = ceil(string->size/8)

    // For an empty string d == 0 and there is neither a last word to zero nor
    // anything to copy; x[d-1] would be an out-of-bounds write.
    if (d > 0) {
        x[d-1] = 0; // zero the padding bytes of the final, possibly partial, word
        memcpy(x, string->chars, string->size*sizeof(string->chars[0]));
    }

    uint64_t val = 0;
    for (size_t i = 0; i < d; i++)
        val += (seed[2*i]+(uint32_t)(x[i]>>32))*(seed[2*i+1]+(uint32_t)x[i]);
    return (val + seed[d]) >> (64-l);
}

// Calculates x = (a*x + b) mod p for the Mersenne prime p = 2^89 - 1.
//
// All multi-word integers are arrays of 32-bit words with the LEAST significant
// word first (index 0):
//   x: 89-bit value, 3 words, p > x. Overwritten with the result (0 <= result < p).
//   a: 89-bit multiplier, 3 words, p > a.
//   b: 64-bit additive term, 2 words; only b[0] and b[1] are read.
void high_bitcount_ax_b_mod_p(uint32_t * x, uint32_t * a, uint32_t * b) {
    const uint32_t low25 = 0x1FFFFFFu;  // the 25 bits of p that live in word 2
    const uint32_t all32 = 0xFFFFFFFFu;

    // y = a*x + b as a 6-word integer (6 = ceil((89+89+1)/32)), seeded with b.
    uint32_t y[6] = { b[0], b[1], 0, 0, 0, 0 };

    // Schoolbook multiplication, one row per word of x. In each step
    // x[i]*a[ii] + y[i+ii] + carry <= (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1,
    // so the 64-bit accumulator cannot overflow.
    for (size_t i = 0; i < 3; i++) { // index x
        uint64_t carry = 0;
        for (size_t ii = 0; ii < 3; ii++) { // index a
            uint64_t t = (uint64_t)x[i] * (uint64_t)a[ii] + y[i+ii] + carry;
            y[i+ii] = (uint32_t)t;
            carry = t >> 32;
        }
        // y[i+3] has not been written by an earlier row, so it is still 0 here
        y[i+3] = (uint32_t)carry;
    }

    //// reduce modulo p using y == (y & p) + (y >> 89) (mod p)
    // y & p: the low 89 bits
    uint32_t yandp[3];
    yandp[0] = y[0];
    yandp[1] = y[1];
    yandp[2] = y[2] & low25;

    // y >> 89: drop two full words and 25 further bits
    uint32_t yshift[3];
    yshift[0] = (y[2] >> 25) | (y[3] << 7);
    yshift[1] = (y[3] >> 25) | (y[4] << 7);
    yshift[2] = (y[4] >> 25) | (y[5] << 7);

    // x = (y & p) + (y >> 89); all reads of x and a are finished at this point
    uint64_t sum = (uint64_t)yandp[0] + yshift[0];
    x[0] = (uint32_t)sum;
    sum = (sum >> 32) + yandp[1] + yshift[1];
    x[1] = (uint32_t)sum;
    sum = (sum >> 32) + yandp[2] + yshift[2];
    x[2] = (uint32_t)sum;

    // While bits at position 89 or above remain, fold them back into the low end.
    // For inputs within the stated bounds at most bit 89 is set, in which case
    // this is exactly "subtract 2^89, add 1", i.e. subtract p.
    while ((x[2] >> 25) != 0) {
        uint64_t t = (uint64_t)x[0] + (x[2] >> 25);
        x[2] &= low25;
        x[0] = (uint32_t)t;
        t = (t >> 32) + x[1];
        x[1] = (uint32_t)t;
        x[2] += (uint32_t)(t >> 32);
    }

    // x == p exactly is congruent to 0
    if (x[0] == all32 && x[1] == all32 && x[2] == low25) {
        x[0] = 0;
        x[1] = 0;
        x[2] = 0;
    }
}


// Polynomial hash of a vector of d 64-bit integers modulo the Mersenne prime
// p = 2^89 - 1.
//
// x holds the d integers as 2*d 32-bit words, element i occupying x[2*i] and
// x[2*i+1]; d must be at least 1 because x[0] and x[1] are read unconditionally.
// a, b and c are arrays of 32-bit words with p > a >= 0, p > b >= 0, p > c >= 0.
// 32 >= l > 0.
//
// Starting from H = element 0, every further element is folded in with
// H = (c*H + element) mod p, followed by a final H = (a*H + b) mod p. The result
// is H[0] >> (32 - l), i.e. the l most significant bits of the first word of H.
// NOTE(review): high_bitcount_ax_b_mod_p documents its additive argument as a
// 2-word value, so a third word of b would not take part in the final step --
// confirm against that function.
uint32_t polynomial_vector(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l) {
    // H is a 3-word accumulator. Its third word must start at zero, otherwise
    // the first reduction reads an indeterminate value.
    uint32_t H[3] = { x[0], x[1], 0 };

    for (size_t i = 1; i < d; i++) {
        high_bitcount_ax_b_mod_p(H, c, x+2*i);
    }
    high_bitcount_ax_b_mod_p(H, a, b);

    return H[0] >> (32-l);
}

// Two-level vector hash: x is cut into chunks of four 64-bit integers (32 bytes),
// each chunk is pre-hashed to 32 bits with multiply_shift_string, and the vector
// of pre-hashes is then hashed with polynomial_vector.
//
// x holds d 64-bit integers as 2*d 32-bit words. a, b, c, and l are forwarded
// unchanged to polynomial_vector. A trailing group of d % 4 integers is
// pre-hashed as a shorter string.
// For a full chunk multiply_shift_string reads seed[0..7], so seed must hold at
// least 8 values whenever d >= 4.
uint32_t polynomial_vector_tuned(uint32_t * x, uint32_t * a, uint32_t * b, uint32_t * c, size_t d, char l, const uint64_t * seed) {
    const size_t chunk_words = 8;  // four 64-bit integers = eight 32-bit words
    const size_t chunk_bytes = 32; // ... = 32 bytes

    size_t full_chunks = d >> 2;   // d / 4
    size_t x_remainder = d & 3;    // d % 4
    size_t chunk_count = full_chunks + (x_remainder != 0 ? 1 : 0);
    // polynomial_vector always reads its first element, so provide at least one
    size_t x_tuned_size = (chunk_count != 0) ? chunk_count : 1;

    // polynomial_vector consumes two 32-bit words per element, so each 32-bit
    // pre-hash is stored as the first word of a pair whose second word is zero.
    uint32_t x_tuned[2*x_tuned_size];
    memset(x_tuned, 0, 2*x_tuned_size*sizeof(x_tuned[0]));

    // prehash chunks of x using the bounded string algorithm
    uint64_t buffer_memory[4]; // scratch for multiply_shift_string: 32 bytes = one chunk
    char word[32];
    for (size_t i = 0; i < full_chunks; i++) {
        memcpy(word, x + i*chunk_words, chunk_bytes);
        My_string str = My_string(word, chunk_bytes);
        x_tuned[2*i] = (uint32_t) multiply_shift_string(&str, seed, buffer_memory, 32);
    }
    // prehash the leftovers
    if (x_remainder != 0) {
        size_t tail_bytes = x_remainder << 3; // 8 bytes per 64-bit integer
        memcpy(word, x + full_chunks*chunk_words, tail_bytes);
        My_string str = My_string(word, tail_bytes);
        x_tuned[2*full_chunks] = (uint32_t) multiply_shift_string(&str, seed, buffer_memory, 32);
    }

    return polynomial_vector(x_tuned, a, b, c, x_tuned_size, l);
}