#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <random>
#include <unistd.h>
#include <sys/mman.h>
#include <sched.h>
#include <boost/chrono.hpp>

#include "row.hpp"
#include "sort_algorithms.hpp"
#include "bounded_prolific_scheduler.hpp"

using namespace std;

/**
 * Restricts the caller's CPU affinity to a single CPU.
 *
 * Builds an affinity mask containing only `cpuId` and applies it with
 * sched_setaffinity() using pid 0 (i.e. the caller itself).
 *
 * @param cpuId Index of the CPU to bind to.
 * @return true if sched_setaffinity() reported success, false otherwise.
 */
bool bind_process_to_cpu(int cpuId)
{
    cpu_set_t allowed;
    CPU_ZERO(&allowed);
    CPU_SET(cpuId, &allowed);

    if (sched_setaffinity(0, sizeof(allowed), &allowed) != 0)
    {
        return false;
    }

    return true;
}

/**
 * Returns a reference to element (f, i, j) of a flat byte array that is
 * indexed as [f][i][j] with inner extents H (rows) and W (columns).
 *
 * The offset is computed in size_t so that large matrices do not overflow
 * int arithmetic.
 *
 * @param A Base pointer of the flat array.
 * @param f Index along the first (outermost) dimension.
 * @param i Row index within slice f.
 * @param j Column index within row i.
 * @param H Number of rows per slice.
 * @param W Number of columns per row.
 */
inline uint8_t& CELL(uint8_t* A, int f, int i, int j, int H, int W)
{
    const size_t rowIndex = static_cast<size_t>(f) * H + i;
    const size_t offset = rowIndex * W + j;
    return A[offset];
}

/**
 * Describes the matrix handed to sort_matrix_task() through its opaque
 * context pointer.
 *
 * Every member has a default value so that a default-constructed context is
 * in a well-defined (empty) state instead of holding indeterminate values.
 */
struct MatrixSortContext
{
    uint8_t* A = nullptr; // Base pointer of the flat F x H x W byte array.
    int F = 0;            // Extent of the first (outermost) dimension.
    int H = 0;            // Number of rows per slice.
    int W = 0;            // Number of columns per row.
};

void sort_matrix_task(int ff, int workerId, int workerCount, void* ctxPtr)
{
    (void)workerId;
    (void)workerCount;

    MatrixSortContext* ctx = static_cast<MatrixSortContext*>(ctxPtr);

    uint8_t* A = ctx->A;
    const int H = ctx->H;
    const int W = ctx->W;

    for (int i = 0; i < H; i++)
    {
        uint8_t* rowPtr = &CELL(A, ff, i, 0, H, W);

        row<uint8_t> currentRow(
            rowPtr,
            static_cast<uint32_t>(W),
            false
        );

        quick_sort(currentRow);
    }
}

int main(int argc, char** argv)
{
    if (argc < 4)
    {
        cout << "Usage: ./main F H W\n";
        return -1;
    }

    int F = atoi(argv[1]);
    int H = atoi(argv[2]);
    int W = atoi(argv[3]);

    if (F <= 0 || H <= 0 || W <= 0)
    {
        cout << "Invalid matrix dimensions.\n";
        return -1;
    }

    long numCPUs = sysconf(_SC_NPROCESSORS_ONLN);
    if (numCPUs <= 0)
    {
        numCPUs = 1;
    }

    int parentCpu = 0 % numCPUs;
    bind_process_to_cpu(parentCpu);

    if (static_cast<size_t>(F) > numeric_limits<size_t>::max() / static_cast<size_t>(H) / static_cast<size_t>(W))
    {
        cout << "Matrix too large.\n";
        return -1;
    }

    const size_t totalCells =
        static_cast<size_t>(F) *
        static_cast<size_t>(H) *
        static_cast<size_t>(W);

    const size_t totalBytes = totalCells * sizeof(uint8_t);

    uint8_t* A = static_cast<uint8_t*>(mmap(
        nullptr,
        totalBytes,
        PROT_READ | PROT_WRITE,
        MAP_SHARED | MAP_ANONYMOUS,
        -1,
        0
    ));

    if (A == MAP_FAILED)
    {
        perror("mmap failed");
        return -1;
    }

    random_device rd;
    mt19937 gen(rd());
    uniform_int_distribution<int> dist(0, 255);

    for (size_t idx = 0; idx < totalCells; idx++)
    {
        A[idx] = static_cast<uint8_t>(dist(gen));
    }

    const int K = 16;

    MatrixSortContext ctx;
    ctx.A = A;
    ctx.F = F;
    ctx.H = H;
    ctx.W = W;

    boost::chrono::high_resolution_clock::time_point t1 =
        boost::chrono::high_resolution_clock::now();

    int schedRc = run_bounded_prolific_scheduler(
        F,
        K,
        sort_matrix_task,
        &ctx,
        true
    );

    if (schedRc != 0)
    {
        cout << "Scheduler failed.\n";
        munmap(A, totalBytes);
        return -1;
    }

    boost::chrono::high_resolution_clock::time_point t2 =
        boost::chrono::high_resolution_clock::now();

    boost::chrono::milliseconds ms =
        boost::chrono::duration_cast<boost::chrono::milliseconds>(t2 - t1);

    cout << "[Parent " << getpid() << "] Worker processes completed." << endl;
    cout << "Total execution time: " << ms.count() << " ms" << endl;

    munmap(A, totalBytes);

    return 0;
}