// POSIX / system headers
#include <pthread.h>       // process-shared barrier
#include <sched.h>         // sched_setaffinity, sched_getcpu, CPU_* macros
#include <signal.h>        // kill, SIGKILL
#include <sys/mman.h>
// Priority: users can set positive priorities above 0. Superusers can go up to -20
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

// C++ standard library
#include <algorithm>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

// Third-party libraries
#include <boost/chrono.hpp>
#include <opencv2/opencv.hpp>

// Project headers
#include "row.hpp"
#include "sort_algorithms.hpp"
using namespace std;
// CPU affinity helper.
//
// Restricts the caller's CPU affinity mask to the single CPU `cpuId` via
// sched_setaffinity (pid argument 0 = the caller).
//
// cpuId:   zero-based CPU index; must lie in [0, CPU_SETSIZE).
// Returns: true if the affinity mask was applied, false otherwise.
// The outcome (and, on failure, the reason) is logged to stdout.
bool bind_process_to_cpu(int cpuId)
{
    // A cpu_set_t can only describe CPUs 0 .. CPU_SETSIZE-1, so reject
    // anything else explicitly instead of handing it to CPU_SET.
    if (cpuId < 0 || cpuId >= CPU_SETSIZE)
    {
        std::cout << " failed to bind to CPU " << cpuId
                  << ": CPU index out of range" << std::endl;
        return false;
    }

    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(cpuId, &cpuset);

    if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0)
    {
        // Capture errno before any stream I/O gets a chance to overwrite it.
        const int err = errno;
        std::cout << " failed to bind to CPU " << cpuId
                  << ": " << std::strerror(err) << std::endl;
        return false;
    }

    std::cout << " bound to CPU " << cpuId << std::endl;
    return true;
}
// Element accessor for the flat video buffer.
//
// The buffer is laid out as [frame][row][column][channel], tightly packed,
// with H rows, W columns and C channels per pixel. Returns a mutable
// reference to channel k of pixel (i, j) in frame f. No bounds checking is
// performed.
uint8_t& PIXEL(uint8_t* video, int f, int i, int j, int k, int H, int W, int C)
{
    // Accumulate the offset in size_t so large videos do not overflow int.
    const size_t rowIndex    = static_cast<size_t>(f) * H + i;
    const size_t pixelIndex  = rowIndex * W + j;
    const size_t byteOffset  = pixelIndex * C + k;
    return video[byteOffset];
}

int main(int argc, char** argv)
{
    if (argc < 3)
    {
        cout << "Usage: ./sort_blue_video input.mp4 output.mp4\n";
        return -1;
    }
    /*Barrier initialization*/
    pthread_barrier_t* barrier = (pthread_barrier_t*)mmap(
    nullptr,
    sizeof(pthread_barrier_t),
    PROT_READ | PROT_WRITE,
    MAP_SHARED | MAP_ANONYMOUS,
    -1,
    0);

	if (barrier == MAP_FAILED)
	{
    		cout << "mmap for barrier failed"<<endl;
    		return -1;
	}
    /*Eof barrier init*/
    /*Root priority*/
    int mrc=setpriority(PRIO_PROCESS, 0, 0);
    if (mrc == -1)
    {
    cout << "[Parent " << getpid() << "] setpriority failed: "
         << endl;
      _exit(0);
    }
    /*Calculate NCPUs*/
    long numCPUs = sysconf(_SC_NPROCESSORS_ONLN);
    if (numCPUs <= 0)
    {
     cout << "Could not determine number of online processors. Using 1." << endl;
     numCPUs = 1;
    }

    cout << "Number of online processors: " << numCPUs << endl;
    /*Parent affinity*/
    int parentCpu = 0 % numCPUs;
    bind_process_to_cpu(parentCpu);

    cv::VideoCapture cap(argv[1]);
    if (!cap.isOpened())
    {
        cout << "Could not open input video.\n";
        return -1;
    }
 
    int F = static_cast<int>(cap.get(cv::CAP_PROP_FRAME_COUNT));
    int W = static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH));
    int H = static_cast<int>(cap.get(cv::CAP_PROP_FRAME_HEIGHT));
    int C = 3;

    double fps = cap.get(cv::CAP_PROP_FPS);
    if (fps <= 0.0) fps = 30.0;

    if (F <= 0 || W <= 0 || H <= 0)
    {
        cout << "Invalid video properties.\n";
        return -1;
    }

    cout << "Frames: " << F << ", Width: " << W << ", Height: " << H
         << ", FPS: " << fps << endl;

    // Shared memory allocation for all video data
    size_t totalBytes = (size_t)F * H * W * C * sizeof(uint8_t);

    uint8_t* video = (uint8_t*)mmap(
        nullptr,
        totalBytes,
        PROT_READ | PROT_WRITE,
        MAP_SHARED | MAP_ANONYMOUS,
        -1,
        0
    );

    if (video == MAP_FAILED)
    {
        perror("mmap failed");
        return -1;
    }

    // Read frames into shared contiguous array
    cv::Mat frame;
    int f = 0;
    while (cap.read(frame) && f < F)
    {
        if (frame.empty())
            break;

        if (frame.channels() != 3)
        {
            cout << "Expected 3-channel BGR video.\n";
            munmap(video, totalBytes);
            return -1;
        }

        for (int i = 0; i < H; i++)
        {
            for (int j = 0; j < W; j++)
            {
                cv::Vec3b pixel = frame.at<cv::Vec3b>(i, j);
                for (int k = 0; k < C; k++)
                    PIXEL(video, f, i, j, k, H, W, C) = pixel[k];
            }
        }
        f++;
    }
    cap.release();

    int actualFrames = f;
    if (actualFrames == 0)
    {
        cout << "No frames read from input video.\n";
        munmap(video, totalBytes);
        return -1;
    }

    // Use 4 forked processes
    const int K = 10;
    const int CHANNEL=2;
    vector<pid_t> children;
    cout << "\n=== Parallel row-sorting section begins ===" << endl;
    cout << "Parent PID: " << getpid() << endl;
    cout << "Creating " << K << " child processes..." << endl;
     boost::chrono::high_resolution_clock::time_point t1 =
    boost::chrono::high_resolution_clock::now();
    /*Barrier init*/
    pthread_barrierattr_t attr;
    pthread_barrierattr_init(&attr);
    pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
    const int barrierCount = K ;   // K children + no parent
    int rc_init = pthread_barrier_init(barrier, &attr, barrierCount);
    if (rc_init != 0)
    {
       cout << "pthread_barrier_init failed: " << strerror(rc_init) << endl;
       pthread_barrierattr_destroy(&attr);
       munmap(barrier, sizeof(pthread_barrier_t));
       munmap(video, totalBytes);
       return -1;
    }

   pthread_barrierattr_destroy(&attr);
    /*EOF Barrier init*/

    for (int p = 0; p < K; p++)
    {
        // cout << "[Parent " << getpid() << "] Requesting creation of child process "
        // << p << "..." << endl;
        pid_t pid = fork();

        if (pid < 0)
        {
            perror("fork failed");
            munmap(video, totalBytes);
            return -1;
        }

        if (pid == 0)
        {
	     //COW
	     pid_t mypid = getpid();
             pid_t parentpid = getppid();
     //Child affinity
      int childCpu = (p + 1) % numCPUs;
        bind_process_to_cpu(childCpu);
     // Try to raise child priority: lower nice value = higher priority
    int rc=setpriority(PRIO_PROCESS, 0, 0);
	if (rc == -1)
	{
    cout << "[Child " << getpid() << "] setpriority failed: "
         << endl;
//	_exit(0);
	} else {
        cout << "[Child index " << p << "] STARTED"
             << " | PID = " << mypid
             << " | Parent PID = " << parentpid
             << " | Priority set to high (nice = "
             << getpriority(PRIO_PROCESS, 0)
	     << " CPU="<< getpriority(PRIO_PROCESS, 0) << ")"
             << endl;
	}
	/*Wait at Barrier*/
	cout << "[Child index " << p << "] waiting at barrier..." << endl;
         pthread_barrier_wait(barrier);
        cout << "[Child index " << p << "] passed barrier, starting work." << endl;
	/*EOF wait at Barrier*/
/*	      cout << "[Child index " << p << "] STARTED"
             << " | PID = " << mypid
             << " | Parent PID = " << parentpid
             << endl;*/
            // Child process:
            // process frames p, p+K, p+2K, ...
	      cout << "[Child index " << p << "] Assigned frames: ";
             bool first = true;
	    for (int ff = p; ff < actualFrames; ff += K)
            {	
            	if (!first) cout << ", ";
            		cout << ff;
            		first = false;
            }
        	cout << endl;
	   //COW
           int rowsProcessed = 0;
           int framesProcessed = 0;

            for (int ff = p; ff < actualFrames; ff += K)
            {
		cout << "[Child index " << p << " | PID " << mypid
                 << "] Processing frame " << ff << "..." << endl;

            	framesProcessed++;
                for (int i = 0; i < H; i++)
                {
                    // Extract blue channel row into contiguous buffer
                    //uint8_t* blueRow = new uint8_t[W];
		     uint8_t* blueRow  = new uint8_t[W];
            	     uint8_t* greenRow = new uint8_t[W];
                     uint8_t* redRow   = new uint8_t[W];
                    for (int j = 0; j < W; j++) {
                       // blueRow[j] = PIXEL(video, ff, i, j, CHANNEL, H, W, C);
		        blueRow[j]  = PIXEL(video, ff, i, j, 0, H, W, C);
                	greenRow[j] = PIXEL(video, ff, i, j, 1, H, W, C);
                	redRow[j]   = PIXEL(video, ff, i, j, 2, H, W, C);
                     }
                    // Wrap with row<uint8_t> and sort
                    //row<uint8_t> r(blueRow, W);
                    //quick_sort(r);
			row<uint8_t> b(blueRow, W);
            		row<uint8_t> g(greenRow, W);
            		row<uint8_t> r(redRow, W);
			quick_sort(b);
            	        quick_sort(g);
                        quick_sort(r);

                    // Write sorted blue channel back
                    for (int j = 0; j < W; j++) {
                        //PIXEL(video, ff, i, j, CHANNEL, H, W, C) = r[j];
                        PIXEL(video, ff, i, j, 0, H, W, C) = b[j];
                        PIXEL(video, ff, i, j, 1, H, W, C) = g[j];
                        PIXEL(video, ff, i, j, 2, H, W, C) = r[j];
		    }

                    delete[] blueRow;
                    delete[] greenRow;
                    delete[] redRow;
		    //COW
		    rowsProcessed++;
                }
		 cout << "[Child index " << p << " | PID " << mypid
                 << "] Finished frame " << ff << "." << endl;
            }
	     cout << "[Child index " << p << "] FINISHED"
             << " | PID = " << mypid
             << " | Frames processed = " << framesProcessed
             << " | Rows processed = " << rowsProcessed
             << endl;

            _exit(0);
        }
        else
        {
	     cout << "[Parent " << getpid() << "] Child process " << p
             << " created successfully with PID " << pid << "." << endl;
            children.push_back(pid);
        }
    }

    // Parent waits for all children
    cout << "\n[Parent " << getpid() << "] All child processes created." << endl;
    cout << "[Parent " << getpid() << "] Waiting for children to finish..." << endl;

 /*   for (pid_t pid : children)
    {
        int status;
        waitpid(pid, &status, 0);
    }*/
    while (!children.empty())
    {
    	int status;
    	pid_t finishedPid = waitpid(-1, &status, 0);

    	if (finishedPid < 0)
    	{
        	cout << "waitpid failed" << endl;
        	break;
    	}

    	cout << "[Parent " << getpid() << "] Child with PID "
        	 << finishedPid << " has terminated";

    	if (WIFEXITED(status))
        	cout << " normally with exit code " << WEXITSTATUS(status);
    	else if (WIFSIGNALED(status))
        	cout << " due to signal " << WTERMSIG(status);

   	 cout << "." << endl;

    	auto it = std::find(children.begin(), children.end(), finishedPid);
    	if (it != children.end())
        	children.erase(it);

    	cout << "[Parent " << getpid() << "] Remaining active children: "
        	 << children.size() << endl;
   }


    boost::chrono::high_resolution_clock::time_point t2 =
    boost::chrono::high_resolution_clock::now();

   boost::chrono::milliseconds ms =
      boost::chrono::duration_cast<boost::chrono::milliseconds>(t2 - t1);
   cout << "[Parent " << getpid() << "] All " << K
     << " child processes have completed." << endl;
   cout << "Total execution time of the " << K
     << " processes: " << ms.count() << " ms" << endl;

    // Prepare writer
    cv::VideoWriter writer(
        argv[2],
	   cv::VideoWriter::fourcc('a','v','c','1'),  // Very reliable codec h264 mp4
        //cv::VideoWriter::fourcc('m', 'p', '4', 'v'),
        fps,
        cv::Size(W, H)
    );

    if (!writer.isOpened())
    {
        cout << "Could not open output video.\n";
        munmap(video, totalBytes);
        return -1;
    }

    // Write processed frames back to output video
    for (int ff = 0; ff < actualFrames; ff++)
    {
        cv::Mat outFrame(H, W, CV_8UC3);

        for (int i = 0; i < H; i++)
        {
            for (int j = 0; j < W; j++)
            {
                cv::Vec3b& pix = outFrame.at<cv::Vec3b>(i, j);
                pix[0] = PIXEL(video, ff, i, j, 0, H, W, C); // B
                pix[1] = PIXEL(video, ff, i, j, 1, H, W, C); // G
                pix[2] = PIXEL(video, ff, i, j, 2, H, W, C); // R
            }
        }

        writer.write(outFrame);
    }

    writer.release();

    // Free shared memory
    munmap(video, totalBytes);
    //Free barrier
    pthread_barrier_destroy(barrier);
    munmap(barrier, sizeof(pthread_barrier_t));
    cout << "Saved processed video to " << argv[2] << endl;
    return 0;
}
