#include "schedulers.h"
#include <sys/mman.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <vector>

// Helper: dump a tensor to stdout as a grid (used for steps 2 and 3).
// `tensor` is read as a row-major buffer holding rows * cols ints. Each value
// is right-aligned in a 6-character field and followed by a single space;
// every row is terminated by a newline.
void print_tensor(int* tensor, int rows, int cols) {
    int flat = 0; // running row-major index, always equal to r * cols + c
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c, ++flat) {
            std::cout << std::setw(6) << tensor[flat] << ' ';
        }
        std::cout << '\n';
    }
}

// Step 2: Tensor A (n x n) Sorting
// Fills an n x n tensor (stored row-major in a shared anonymous mapping) with
// random values in [-1000, 1000], prints it, hands one "sort row i" task per
// row to ProlificScheduler, prints the tensor again and releases the mapping.
// If the mapping cannot be created the step is skipped with a message on
// stderr instead of writing through an invalid pointer.
void run_step2() {
    std::cout << "\n 2D TENSOR (n x n) ROW SORTING \n";
    int n = 5; // Matrix size 5x5 for testing
    const std::size_t bytes = static_cast<std::size_t>(n) * n * sizeof(int);

    // mmap reports failure with MAP_FAILED (not a null pointer), so it must be
    // checked explicitly before the buffer is touched.
    void* mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (mapping == MAP_FAILED) {
        std::cerr << "run_step2: mmap failed, skipping step\n";
        return;
    }
    int* tensor_A = static_cast<int*>(mapping);
    for (int i = 0; i < n * n; ++i) tensor_A[i] = (rand() % 2001) - 1000;

    std::cout << "Original 2D Tensor (Unsorted):\n";
    print_tensor(tensor_A, n, n);

    // Task body: sort the n contiguous ints that make up row `row_id`.
    auto sort_row_task = [tensor_A, n](int row_id) {
        int* row_begin = tensor_A + (row_id * n);
        std::sort(row_begin, row_begin + n);
    };

    // NOTE(review): the MAP_SHARED mapping suggests the scheduler runs tasks in
    // forked processes -- confirm. Flushing first keeps buffered output from
    // being inherited (and possibly re-emitted) by workers.
    std::cout << "->Executing concurrent row sorting using Prolific Scheduler\n" << std::flush;
    ProlificScheduler::execute(n, n, sort_row_task);

    std::cout << "Final 2D Tensor (Sorted):\n";
    print_tensor(tensor_A, n, n);

    munmap(tensor_A, bytes);
}

// Step 3: Tensor B (n x n x k) Sorting
// Fills k consecutive n x n slices (stored back to back, row-major, in a
// shared anonymous mapping) with random values in [-1000, 1000], prints them,
// hands one "sort every row of slice s" task per slice to CollectiveScheduler,
// prints the tensor again and releases the mapping.
// If the mapping cannot be created the step is skipped with a message on
// stderr instead of writing through an invalid pointer.
void run_step3() {
    int n = 3; // Keep n small so we can read the printout
    int k = 2; // 2 workers for visualization
    std::cout << "\n 3D TENSOR (n x n x k) SLICE SORTING \n";

    int total_elements = n * n * k;
    const std::size_t bytes = static_cast<std::size_t>(total_elements) * sizeof(int);

    // mmap reports failure with MAP_FAILED (not a null pointer), so it must be
    // checked explicitly before the buffer is touched.
    void* mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (mapping == MAP_FAILED) {
        std::cerr << "run_step3: mmap failed, skipping step\n";
        return;
    }
    int* tensor_B = static_cast<int*>(mapping);
    for (int i = 0; i < total_elements; ++i) tensor_B[i] = (rand() % 2001) - 1000;

    std::cout << "Original 3D Tensor (Unsorted)\n";
    // The k slices are printed stacked vertically as one (n * k) x n grid.
    print_tensor(tensor_B, n * k, n);

    // Task body: slice `slice_id` starts n * n ints into the buffer per
    // preceding slice; each of its n rows is sorted independently.
    auto sort_2d_slice_task = [tensor_B, n](int slice_id) {
        int* slice_begin = tensor_B + slice_id * (n * n);
        for (int row = 0; row < n; ++row) {
            int* row_begin = slice_begin + (row * n);
            std::sort(row_begin, row_begin + n);
        }
    };

    // NOTE(review): the MAP_SHARED mapping suggests the scheduler runs tasks in
    // forked processes -- confirm. Flushing first keeps buffered output from
    // being inherited (and possibly re-emitted) by workers.
    std::cout << "-> Executing concurrent slice sorting using Collective Scheduler\n" << std::flush;
    // For collective, we need a level count. 2 levels = 3 workers, which is enough for k=2 tasks.
    CollectiveScheduler::execute(2, k, sort_2d_slice_task);

    std::cout << "Final 3D Tensor (Sorted):\n";
    print_tensor(tensor_B, n * k, n);

    munmap(tensor_B, bytes);
}

// Step 5: Benchmarking & Reporting (k = 10, 100)
// For each k, builds an n x n x k tensor of random values in [-1000, 1000] in
// a shared anonymous mapping and times the same "sort every row of slice s"
// workload under ProlificScheduler and then CollectiveScheduler, printing the
// elapsed wall time of each in milliseconds.
//
// The random input is also kept in a private copy and written back into the
// shared buffer before the second measurement: otherwise the second scheduler
// would be timed on rows the first one already sorted, which is not a
// comparable workload.
void run_step5() {
    std::cout << "\n PERFORMANCE BENCHMARKING: PROLIFIC vs COLLECTIVE SCHEDULER \n";
    int n = 100; // Larger matrix for realistic timing
    int k_values[] = {10, 100};

    for (int k : k_values) {
        std::cout << "\nRunning Benchmark for k = " << k << " (Tensor " << n << "x" << n << "x" << k << ")\n";
        int total_elements = n * n * k;
        const std::size_t bytes = static_cast<std::size_t>(total_elements) * sizeof(int);

        // mmap reports failure with MAP_FAILED (not a null pointer), so it must
        // be checked explicitly before the buffer is touched.
        void* mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (mapping == MAP_FAILED) {
            std::cerr << "run_step5: mmap failed for k = " << k << ", skipping this size\n";
            continue;
        }
        int* tensor_B = static_cast<int*>(mapping);

        // Generate the input once, then load it into the shared buffer.
        std::vector<int> pristine(static_cast<std::size_t>(total_elements));
        for (int& value : pristine) value = (rand() % 2001) - 1000;
        std::copy(pristine.begin(), pristine.end(), tensor_B);

        // Task body: sort each of the n rows of slice `slice_id` in place.
        auto sort_task = [tensor_B, n](int slice_id) {
            int* slice_begin = tensor_B + slice_id * (n * n);
            for (int row = 0; row < n; ++row) {
                int* row_begin = slice_begin + (row * n);
                std::sort(row_begin, row_begin + n);
            }
        };

        // NOTE(review): the MAP_SHARED mapping suggests the schedulers run tasks
        // in forked processes -- confirm. Flushing before the timed regions keeps
        // buffered output from being inherited (and possibly re-emitted) by workers.
        std::cout << std::flush;

        // Measure Prolific (steady_clock is monotonic, which is what interval timing needs)
        auto start_prolific = std::chrono::steady_clock::now();
        ProlificScheduler::execute(k, k, sort_task);
        auto end_prolific = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::milli> diff_prolific = end_prolific - start_prolific;
        std::cout << "Prolific Scheduler Execution Time: " << diff_prolific.count() << "ms\n";

        // Restore the unsorted input so the Collective run sorts the same data.
        std::copy(pristine.begin(), pristine.end(), tensor_B);
        std::cout << std::flush;

        // Measure Collective
        // Safety: Prevent Fork Bomb! Calculate the minimum required levels for 'k' workers.
        int collective_levels = static_cast<int>(std::ceil(std::log2(k + 1)));

        auto start_collective = std::chrono::steady_clock::now();
        CollectiveScheduler::execute(collective_levels, k, sort_task);
        auto end_collective = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::milli> diff_collective = end_collective - start_collective;
        std::cout << "Collective Scheduler Execution Time: " << diff_collective.count() << "ms\n";

        munmap(tensor_B, bytes);
    }
}

// Main execution
// Seeds the C random number generator from the current time, runs steps 2, 3
// and 5 in that order, then prints a completion message.
int main() {
    srand(static_cast<unsigned int>(time(nullptr)));

    // Run all steps, in order
    void (*const steps[])() = {run_step2, run_step3, run_step5};
    for (auto step : steps) {
        step();
    }

    std::cout << "\nHomework execution completed successfully.\n";
    return 0;
}
