/////////////////////////////////////////////////////////////////////////
// Zero Intelligence Traders
// Author:      Dale K. Brearcliffe
// Email:       dbrearcl@gmu.edu
// Date:        1 February 2017
// Version:     1.0
// Inspired by: Dr. Robert Axtell's C pThread version
// Based on:    Gode and Sunder, QJE, 1993
// Usage:       ./zit #, where # is an integer for the number of threads
//
// Copyright 2017 Dale K. Brearcliffe
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// 
//   http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/////////////////////////////////////////////////////////////////////////

#include <math.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// CUDA's default random number generator
#include <curand.h>
#include <curand_kernel.h>

///////////////////////////////////////////////////////////////////////
// Define some values for later use
///////////////////////////////////////////////////////////////////////

// 'true' and 'false' are keywords in C++ (which is how nvcc compiles this
// file) and must not be redefined as macros there. The macro versions are
// only supplied when the file is built as plain C.
#ifndef __cplusplus
#define false 0
#define true 1
#endif

// Used for debugging...
#define verbose false
// Used for outputting data in a single row
#define dataCollect false

// Values stored in an agent's buyerOrSeller field
#define buyer true
#define seller false

// Number of trades to take place
#define MaxNumberOfTrades 100000000

// Specify the number of agents, the same for each type
#define numberOfAgents 1000000

// Specify the maximum internal values for buyer and seller budgets
#define maxBuyerValue 30
#define maxSellerValue 30

// Define an Agent's internal structure
struct TraderAgent {
   int buyerOrSeller; // Role flag: set to 'buyer' or 'seller' when the agent is created
   int quantityHeld;  // Items currently held (buyers start with 0, sellers with 1)
   int value;         // Private valuation of the item, drawn once at start-up
   int price;         // Price of the agent's last trade; 0 until it trades
};
// Short alias used throughout the rest of the file
typedef struct TraderAgent Agent;

//////////////////////////////////////////////////////////////////////////
// CUDA commands require error checking designed for their environment.
// A macro is defined to be used as a wrapper around the CUDA command.
// This is a commonly known and used macro that is found on the Internet
// in several formats.
// This variant was found on StackOverflow at URL: http://stackoverflow.com/
//    questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-
//    the-cuda-runtime-api
//////////////////////////////////////////////////////////////////////////

// gpuErrchk(call): evaluates a CUDA runtime call and hands its status, plus
// the source location of the call site, to gpuAssert().
// The do { } while (0) wrapper makes the macro behave as a single statement,
// so 'gpuErrchk(x);' is safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr.
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default) the process exits with 'code' as its
//           exit status; when false the error is only reported
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
   if (code != cudaSuccess) {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//////////////////////////////////////////////////////////////////////////
// GPU kernels are sections of code that will be passed from the host
// (CPU) to the device (GPU). 
//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
// GPU Kernel
// Gives every launched thread its own pseudorandom generator state.
// The thread with global index i initializes statesOfRandom[i] using the
// seed randomSeed + i (sequence 0, offset 0), so neighbouring threads
// start from different seeds.
//
// Parameters:
//   randomSeed     - base seed supplied by the host
//   statesOfRandom - device array with one slot per launched thread
//
// The kernel has no bounds guard: statesOfRandom must hold at least
// gridDim.x * blockDim.x entries.
//////////////////////////////////////////////////////////////////////////

__global__ void init_curandom(unsigned int randomSeed, curandState_t* statesOfRandom) {
   // Flat 1-D global thread index; it is both the array slot and the seed offset
   const int slot = threadIdx.x + blockDim.x * blockIdx.x;
   curandState_t* const myState = statesOfRandom + slot;
   curand_init(randomSeed + slot, 0, 0, myState);
} // KERNEL init_curandom()

//////////////////////////////////////////////////////////////////////////
// GPU Kernel
// Each thread works on its own contiguous slice of 'agentsPerThread'
// buyers and the matching slice of sellers, i.e. the IDs
//    [index * agentsPerThread, (index + 1) * agentsPerThread - 1].
// It makes 'tradesPerThread' attempts. Each attempt picks a random buyer
// and a random seller from the slice, draws a bid and an ask, and, if a
// trade is possible, records it in both agent records.
// This same kernel will be run for each thread.
//
// Parameters:
//   maxTrades       - overall trade budget; not used inside the kernel
//   Buyers, Sellers - agent arrays; must hold at least
//                     (launched threads) * agentsPerThread entries each
//   statesOfRandom  - one initialized curand state per launched thread
//   agentsPerThread - number of buyers (and sellers) in each thread's slice
//   tradesPerThread - number of trade attempts made by each thread
//
// Launch layout: 1-D grid of 1-D blocks. The kernel has no bounds guard,
// so the caller must size the arrays to match the launch. Slices of
// different threads do not overlap, so no atomics are used.
//////////////////////////////////////////////////////////////////////////

__global__ void DoTrades (int maxTrades, Agent *Buyers, Agent *Sellers, curandState_t* statesOfRandom, int agentsPerThread, int tradesPerThread) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;

   // Lower and upper (inclusive) buyer ID boundaries for this thread
   int LBB = index * agentsPerThread;
   int UBB = (index + 1) * agentsPerThread - 1;
   // Number of buyer IDs in the inclusive range [LBB, UBB]. The "+ 1" is
   // required so that 'curand % RBB' can also reach UBB itself.
   int RBB = UBB - LBB + 1;
   // Never take a modulo by zero (or a negative count)
   if (RBB < 1) {
      RBB = 1;
   }

   // Lower and upper (inclusive) seller ID boundaries for this thread
   int LSB = index * agentsPerThread;
   int USB = (index + 1) * agentsPerThread - 1;
   // Number of seller IDs in the inclusive range [LSB, USB]
   int RSB = USB - LSB + 1;
   if (RSB < 1) {
      RSB = 1;
   }

   // Work on a local copy of this thread's generator state so that every
   // draw does not read and write global memory; the sequence of numbers
   // produced is the same. The state is stored back when the loop is done.
   curandState_t localState = statesOfRandom[index];

   int buyerID, bidPrice, sellerID, askPrice, transactionPrice;

   // Try 'tradesPerThread' times to make a trade
   for (int i=0; i<tradesPerThread; i++) {
      // Needs to be -1 if debugging is turned on
      transactionPrice = -1;
      // Select a random buyer from within the range of buyers allocated to this thread
      buyerID = LBB + curand(&localState) % RBB;
      // Pick a random bid price in [1, buyer's value]
      bidPrice = (curand(&localState) % Buyers[buyerID].value) + 1;
      // Select a random seller from within the range of sellers allocated to this thread
      sellerID = LSB + curand(&localState) % RSB;
      // Pick a random ask price in [seller's value, maxSellerValue]
      askPrice = Sellers[sellerID].value + (curand(&localState) % (maxSellerValue - Sellers[sellerID].value + 1));
      // If the buyer has no items, the seller has one, and the bid is
      // greater than or equal to the ask, conduct the trade
      if ((Buyers[buyerID].quantityHeld == 0) && (Sellers[sellerID].quantityHeld == 1) && (bidPrice >= askPrice)) {
         // Transaction occurs somewhere in [askPrice, bidPrice]
         transactionPrice = askPrice + curand(&localState) % (bidPrice - askPrice + 1);
         // Update the agent records
         Buyers[buyerID].price = transactionPrice;
         Sellers[sellerID].price = transactionPrice;
         Buyers[buyerID].quantityHeld = 1;
         Sellers[sellerID].quantityHeld = 0;
      }
      if (verbose==true) {
         // Debug hook (device printf is slow; enable only for small runs):
         // printf("Index: %i Block: %i | i=%i (Buyer[%i] Bid = %i & Seller[%i] Ask = %i) ==> Trans = %i\n", index, blockIdx.x, i, buyerID, Buyers[buyerID].value, sellerID, Sellers[sellerID].value, transactionPrice);
      }
   }

   // Persist the advanced generator state for any later kernel launch
   statesOfRandom[index] = localState;
} // KERNEL DoTrades()

//////////////////////////////////////////////////////////////////////////
// Host-side helpers used only by main()
//////////////////////////////////////////////////////////////////////////

// Prints the queried device properties (debugging aid for 'verbose' runs).
static void printDeviceProperties(const cudaDeviceProp &prop) {
   printf("Name: %s\n", prop.name);
   // size_t fields are printed with %zu so the output is correct on every platform
   printf("TotalGlobalMem: %zu\n", prop.totalGlobalMem);
   printf("SharedMemPerBlock: %zu\n", prop.sharedMemPerBlock);
   printf("RegsPerBlock: %i\n", prop.regsPerBlock);
   printf("WarpSize: %i\n", prop.warpSize);
   printf("MemPitch: %zu\n", prop.memPitch);
   printf("MaxThreadsPerBlock: %i\n", prop.maxThreadsPerBlock);
   printf("MaxThreadsDim: [%i, %i, %i]\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
   printf("MaxGridSize: [%i, %i, %i]\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
   printf("TotalConstMem: %zu\n", prop.totalConstMem);
   printf("Major Revision: %i\n", prop.major);
   printf("Minor Revision: %i\n", prop.minor);
   printf("ClockRate: %i\n", prop.clockRate);
   printf("TextureAlignment: %zu\n", prop.textureAlignment);
   printf("DeviceOverlap: %i\n", prop.deviceOverlap);
   printf("MultiProcessorCount: %i\n", prop.multiProcessorCount);
   printf("KernelExecTimeoutEnabled: %i\n", prop.kernelExecTimeoutEnabled);
   printf("Integrated: %i\n", prop.integrated);
   printf("CanMapHostMemory: %i\n", prop.canMapHostMemory);
   printf("ComputeMode: %i\n", prop.computeMode);
   printf("ConcurrentKernels: %i\n", prop.concurrentKernels);
   printf("ECCEnabled: %i\n", prop.ECCEnabled);
   printf("pciBusID: %i\n", prop.pciBusID);
   printf("pciDeviceID: %i\n", prop.pciDeviceID);
   printf("tccDriver: %i\n", prop.tccDriver);
   printf("\n");
}

// Fills 'count' agents with the given role and starting quantity, a random
// value in [1, maxValue] drawn from the host rand(), and a price of 0.
static void populateAgents(Agent *agents, int count, int role, int startingQuantity, int maxValue) {
   for (int i=0; i<count; i++) {
      agents[i].buyerOrSeller = role;
      agents[i].quantityHeld = startingQuantity;
      agents[i].value = (rand() % maxValue) + 1;
      agents[i].price = 0;
   }
}

// Counts the agents whose quantityHeld equals 'tradedQuantity' and adds
// their prices and squared prices to the running integer sums.
static int accumulateTradedPrices(const Agent *agents, int count, int tradedQuantity, long long *sumPrices, long long *sumSquares) {
   int numTraded = 0;
   for (int i=0; i<count; i++) {
      if (agents[i].quantityHeld==tradedQuantity) {
         long long price = agents[i].price;
         numTraded += 1;
         *sumPrices += price;
         *sumSquares += price * price;
      }
   }
   return numTraded;
}

//////////////////////////////////////////////////////////////////////////
//
// MAIN
//
// Usage: ./zit #, where # is a positive integer for the number of threads.
// Returns 0 on success and 1 when the arguments are unusable. A failed
// CUDA call terminates the program through gpuErrchk.
//
//////////////////////////////////////////////////////////////////////////

int main(int argc, char *argv[])  {

   // Grab the number of desired threads from the command line
   if (argc<2) {
      printf("ERROR: Thread target missing.\n");
      return(1);
   }
   if (argc>2) {
      printf("Excess arguments have been ignored.\n");
   }
   // The first argument is always the thread target, even when extra
   // arguments follow it
   int threadTarget = atoi(argv[1]);
   // atoi() yields 0 for non-numeric text; zero or negative targets would
   // lead to a division by zero in the block calculation below
   if (threadTarget<1) {
      printf("ERROR: Thread target must be a positive integer.\n");
      return(1);
   }

   // Display a friendly header
   if (dataCollect==false) {
      printf("\nZERO INTELLIGENCE TRADERS\n");
   }

   // Each CUDA device has a unique set of properties
   // All are displayed for debugging, but only some are used in the code
   cudaDeviceProp prop;
   gpuErrchk( cudaGetDeviceProperties(&prop, 0) );
   if (verbose==true) {
      printDeviceProperties(prop);
   }

   // Initialize pseudorandom number generator with random seed
   // This is the standard 'C' routine used on the host (CPU)
   srand((unsigned int) time(NULL));

   // Each computation block has a limited number of threads that can run
   // in parallel
   int maxThreadsPerBlock = prop.maxThreadsPerBlock;
   // Threads will be run in groups the size of a warp
   // While there can be a group run that is less than a multiple of warp,
   // the hardware will 'round up' to the nearest warp size
   int threadsPerWarp = prop.warpSize;
   // The number of desired threads is divided into blocks with each block
   // containing some number of threads (up to maxThreadsPerBlock)
   // Note: These are integer calculations - fractions are dropped along the way
   // Ceiling division written so that it cannot overflow for large targets
   int numBlocks = (threadTarget - 1) / maxThreadsPerBlock + 1;
   int numThreads = threadTarget / numBlocks / threadsPerWarp * threadsPerWarp;
   // A special case where the number of threads is less than the warp size
   // Ensures there are not zero threads
   if (numThreads<threadsPerWarp) {
      numThreads = threadTarget;
   }
   // The actual threads will be less than or equal to the thread target
   // as some multiple of the warp size
   int actualThreads = numBlocks * numThreads;
   // Bail out if there are more threads than agents
   // While code could have fixed this, it is a logical error to need more
   // threads than agents
   if (actualThreads>numberOfAgents) {
      printf("ERROR! Number of threads (%i) cannot be larger than number of agents (%i)\n",actualThreads,numberOfAgents);
      return(1);
   }

   // Set up and populate the buyer and seller agents
   // Unified (managed) memory is used so the same pointers are valid on
   // both the host and the device
   // https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/
   Agent *Buyers, *Sellers;
   gpuErrchk( cudaMallocManaged(&Buyers, numberOfAgents*sizeof(Agent)) );
   gpuErrchk( cudaMallocManaged(&Sellers, numberOfAgents*sizeof(Agent)) );
   // Buyers start empty-handed, sellers start with one item
   // (buyers are filled first so the order of rand() draws is unchanged)
   populateAgents(Buyers, numberOfAgents, buyer, 0, maxBuyerValue);
   populateAgents(Sellers, numberOfAgents, seller, 1, maxSellerValue);

   // For each thread, how many agents to consider?
   int agentsPerThread = numberOfAgents / actualThreads;
   // Safety check - must have at least one agent per thread
   if (agentsPerThread<1) {
      agentsPerThread=1;
   }
   // For each thread, how many trades to conduct?
   int tradesPerThread = MaxNumberOfTrades / actualThreads;
   // Safety check - must have at least one trade per thread
   if (tradesPerThread<1) {
      tradesPerThread=1;
   }

   curandState_t* statesOfRandom;
   // Allocate space on the GPU for the pseudorandom states
   // This will only be accessed by the device so cudaMalloc is used
   gpuErrchk( cudaMalloc((void**) &statesOfRandom, actualThreads * sizeof(curandState_t)) );
   // Use a GPU kernel to initialize the pseudorandom states for each thread
   // time(0) is sent as the seed
   init_curandom<<<numBlocks, numThreads>>>((unsigned int) time(0), statesOfRandom);
   // Check for errors
   gpuErrchk( cudaPeekAtLastError() );
   // Wait here (block) until all threads are done
   gpuErrchk( cudaDeviceSynchronize() );

   // Capture the amount of time spent in the DoTrades kernel
   // Set up an event and get the start time
   cudaEvent_t start, stop;
   gpuErrchk( cudaEventCreate(&start) );
   gpuErrchk( cudaEventCreate(&stop) );
   gpuErrchk( cudaEventRecord(start) );
   // Execute all trades on the GPU
   DoTrades<<<numBlocks, numThreads>>>(MaxNumberOfTrades, Buyers, Sellers, statesOfRandom, agentsPerThread, tradesPerThread);
   // Check for launch errors
   gpuErrchk( cudaPeekAtLastError() );
   // Queue the stop event directly behind the kernel so the measured
   // interval covers the kernel and not the host-side wait
   gpuErrchk( cudaEventRecord(stop) );
   // Wait here (block) until all threads are done; execution errors surface here
   gpuErrchk( cudaDeviceSynchronize() );
   gpuErrchk( cudaEventSynchronize(stop) );
   // Calculate the elapsed time (milliseconds) and store in timeDoTrades
   float timeDoTrades = 0;
   gpuErrchk( cudaEventElapsedTime(&timeDoTrades, start, stop) );
   gpuErrchk( cudaEventDestroy(start) );
   gpuErrchk( cudaEventDestroy(stop) );

   // Compute the statistics
   // Prices are integers, so the sums are kept in 64-bit integers: a float
   // accumulator stops being exact above 2^24 and silently drops small terms
   long long sumPrices = 0;
   long long sumSquares = 0;
   // First the buyers (a buyer that traded now holds one item)
   int numBought = accumulateTradedPrices(Buyers, numberOfAgents, 1, &sumPrices, &sumSquares);
   // Now the sellers (a seller that traded now holds none)
   int numSold = accumulateTradedPrices(Sellers, numberOfAgents, 0, &sumPrices, &sumSquares);
   // Finalize the stats
   int totalTraded = numBought + numSold;
   double priceAvg = 0.0;
   double sd = 0.0;
   // The mean needs at least one sample
   if (totalTraded>0) {
      priceAvg = (double) sumPrices / totalTraded;
   }
   // The sample standard deviation needs at least two samples
   if (totalTraded>1) {
      double variance = ((double) sumSquares - totalTraded * priceAvg * priceAvg) / (totalTraded - 1.0);
      // Rounding can leave a tiny negative value when all prices are equal
      if (variance<0.0) {
         variance = 0.0;
      }
      sd = sqrt(variance);
   }

   // Display results
   if (dataCollect==true) {
      printf("%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%f,%f,%f\n",MaxNumberOfTrades, numberOfAgents, threadTarget, actualThreads, numBlocks, numThreads, agentsPerThread, tradesPerThread, numBought, numSold, priceAvg, sd, timeDoTrades);
   }
   else {
      printf("Number of Blocks:   %i\n", numBlocks);
      printf("Threads Per Block:  %i\n", numThreads);
      printf("Thread Target:      %i\n", threadTarget);
      printf("Actual Threads:     %i\n", actualThreads);
      printf("Trades:             %i\n", MaxNumberOfTrades);
      printf("Agents:             %i\n", numberOfAgents);
      printf("Agents per Thread:  %i\n", agentsPerThread);
      printf("Trades per Thread:  %i\n", tradesPerThread);
      printf("Total Trades:       %i\n", totalTraded);
      printf("NUMBOUGHT:          %i\n", numBought);
      printf("NUMSOLD:            %i\n", numSold);
      printf("Average Price:      %f\n", priceAvg);
      printf("Standard Deviation: %f\n", sd);
   }

   // Free memory and clean up
   gpuErrchk( cudaFree(Buyers) );
   gpuErrchk( cudaFree(Sellers) );
   gpuErrchk( cudaFree(statesOfRandom) );
   gpuErrchk( cudaDeviceReset() );

   // We're done
   return(0);
}