#include <pthread.h>
#include <atomic>
#include <vector>
#include <chrono>
#include "row.hpp"
#include "sort_algorithms.hpp"

/// Describes the byte matrix whose rows are sorted by the scheduler.
///
/// The scheduler treats the buffer as `F * H` consecutive rows, each `W`
/// bytes wide; row `i` starts at `A + i * W`.
struct MatrixSortContext {
    uint8_t* A;  ///< Base address of the matrix buffer (not owned by this struct).
    int F;       ///< First row-count factor (presumably frames/slices — confirm with caller).
    int H;       ///< Second row-count factor (presumably rows per frame — confirm with caller).
    int W;       ///< Number of bytes in one row.
};

// Argument bundle handed to each steal_worker thread through the void*
// parameter of pthread_create. Member order matters: the scheduler fills it
// with positional aggregate initialisation.
struct StealArgs {
    // Matrix description (buffer base and row width) read by the worker.
    MatrixSortContext* ctx;
    // Shared index of the next unclaimed row; workers advance it with
    // fetch_add to claim a chunk.
    std::atomic<int>* global_counter;
    // Total number of rows to sort; claiming stops once the counter reaches it.
    int total_rows;
    // Number of rows a worker claims per fetch_add.
    int chunk_size;
};

void* steal_worker(void* arg) {
    StealArgs* sa = (StealArgs*)arg;
    int start;
    int c_size = sa->chunk_size;

    // Δυναμική λήψη chunks (Work Stealing logic)
    while ((start = sa->global_counter->fetch_add(c_size)) < sa->total_rows) {
        int end = std::min(start + c_size, sa->total_rows);
        for (int i = start; i < end; ++i) {
            uint8_t* row_ptr = sa->ctx->A + (static_cast<size_t>(i) * sa->ctx->W);
            // Χρήση της row<uint8_t> για in-place ταξινόμηση
            row<uint8_t> r(row_ptr, static_cast<uint32_t>(sa->ctx->W), false);
            quick_sort(r);
        }
    }
    return nullptr;
}

/**
 * Sorts every row of the matrix with up to K worker threads that pull
 * chunks of rows from a shared atomic counter, and reports the wall time.
 *
 * @param ctx        Matrix description; `ctx->F * ctx->H` rows are processed.
 * @param K          Requested number of worker threads. Values <= 0 start no
 *                   workers (nothing is sorted).
 * @param chunk_size Rows claimed per counter increment; non-positive values
 *                   are treated as 1 so workers always make progress.
 * @return Elapsed time in milliseconds, covering thread creation through
 *         the last join.
 */
long long run_steal_scheduler(MatrixSortContext* ctx, int K, int chunk_size = 32) {
    // NOTE(review): F * H is computed in int, as before; callers must keep the
    // product within int range.
    const int total_rows = ctx->F * ctx->H;
    const int safe_chunk = chunk_size > 0 ? chunk_size : 1;
    std::atomic<int> global_counter(0);

    // Every worker gets the same read-only description, so a single instance
    // is shared instead of building K identical copies.
    StealArgs shared_args = {ctx, &global_counter, total_rows, safe_chunk};

    // std::vector replaces the non-standard variable-length arrays, which
    // were also undefined for K <= 0.
    std::vector<pthread_t> threads;
    if (K > 0) {
        threads.reserve(static_cast<size_t>(K));
    }

    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < K; ++i) {
        pthread_t tid;
        if (pthread_create(&tid, nullptr, steal_worker, &shared_args) != 0) {
            // Stop spawning; the workers already running will drain the
            // remaining chunks from the shared counter.
            break;
        }
        threads.push_back(tid);
    }

    // If workers were requested but none could be started, do the work on the
    // calling thread so the rows still end up sorted.
    if (K > 0 && threads.empty()) {
        steal_worker(&shared_args);
    }

    // Join only the threads that were actually created.
    for (pthread_t& tid : threads) {
        pthread_join(tid, nullptr);
    }
    const auto t2 = std::chrono::steady_clock::now();

    return std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
}