///////////////////////////////////////////////////////////////////////// // Zero Intelligence Traders // Author: Dale K. Brearcliffe // Email: dbrearcl@gmu.edu // Date: 1 February 2017 // Version: 1.0 // Inspired by: Dr. Robert Axtell's C pThread version // Based on: Gode and Sunder, QJE, 1993 // Usage: ./zit #, where # is an integer for the number of threads // // Copyright 2017 Dale K. Brearcliffe // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. ///////////////////////////////////////////////////////////////////////// #include #include #include #include // CUDA's default random number generator #include #include /////////////////////////////////////////////////////////////////////// // Define some values for later use /////////////////////////////////////////////////////////////////////// #define false 0 #define true 1 // Used for debugging... #define verbose false // Used for outputing data in a single row #define dataCollect false #define buyer true #define seller false // Number of trades to take place #define MaxNumberOfTrades 100000000 // Specify the number of agents, the same for each type #define numberOfAgents 1000000 // Specify the maximum internal values for buyers and seller budgets #define maxBuyerValue 30 #define maxSellerValue 30 // Define an Agent's internal structure typedef struct TraderAgent { int buyerOrSeller; // Is this agent a buyer or seller? int quantityHeld; // How many items does the agent have? int value; // What is the value of the item? 
int price; // What was the purchase or sale price?
} Agent; // Agent Structure

//////////////////////////////////////////////////////////////////////////
// CUDA commands require error checking designed for their environment.
// A macro is defined to be used as a wrapper around the CUDA command.
// This is a commonly known and used macro that is found on the Internet
// in several formats.
// This variant was found on StackOverflow at URL: http://stackoverflow.com/
// questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-
// the-cuda-runtime-api
//
// The macro body is wrapped in do { ... } while (0) so that
// "gpuErrchk(call);" expands to exactly one statement. A bare { ... }
// block followed by ';' would break an unbraced if/else around the call.
//////////////////////////////////////////////////////////////////////////
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

//////////////////////////////////////////////////////////////////////////
// Host helper behind gpuErrchk().
// Reports a failed CUDA runtime call on stderr and optionally terminates.
//   code  - status returned by the wrapped CUDA call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when set (the default), the process exits with 'code' as its
//           exit status after the message is printed
// Does nothing when code is cudaSuccess.
//////////////////////////////////////////////////////////////////////////
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Fast path: the wrapped call succeeded
    if (code == cudaSuccess)
    {
        return;
    }

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (abort)
    {
        exit(code);
    }
}

//////////////////////////////////////////////////////////////////////////
// GPU kernels are sections of code that will be passed from the host
// (CPU) to the device (GPU).
//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
// GPU Kernel
// Will seed a pseudorandom number generator for each thread so no thread
// follows the same sequence.
//
//   randomSeed     - base seed; thread 'index' is seeded with
//                    randomSeed + index (sequence 0, offset 0)
//   statesOfRandom - per-thread generator states, written at [index]
//
// Launch layout: only the x dimension of the grid and block is used to
// form the index. There is no bounds guard, so statesOfRandom must hold
// at least gridDim.x * blockDim.x elements.
//////////////////////////////////////////////////////////////////////////
__global__ void init_curandom(unsigned int randomSeed, curandState_t* statesOfRandom)
{
    // Flat global thread index
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    curand_init(randomSeed+index, 0, 0, &statesOfRandom[index]);
} // KERNEL init_curandom()

//////////////////////////////////////////////////////////////////////////
// GPU Kernel
// Given a range of buyer and seller agents, select a random buyer and
// random seller. If a trade is possible, then it occurs and each agent's
// status is updated.
// This same kernel will be run for each thread.
////////////////////////////////////////////////////////////////////////// __global__ void DoTrades (int maxTrades, Agent *Buyers, Agent *Sellers, curandState_t* statesOfRandom, int agentsPerThread, int tradesPerThread) { int index = blockIdx.x * blockDim.x + threadIdx.x; // Calculate the lower and upper buyer IDs boundaries based on index int LBB = index * agentsPerThread; int UBB = (index + 1) * agentsPerThread - 1; // Calculate the buyer's ID range, make sure it isn't zero int RBB = UBB - LBB; if (RBB==0) { RBB=1; } // Calculate the lower and upper seller IDs boundaries based on index int LSB = index * agentsPerThread; int USB = (index + 1) * agentsPerThread - 1; int RSB = USB - LSB; // Calculate the seller's ID range, make sure it isn't zero if (RSB==0) { RSB=1; } int buyerID, bidPrice, sellerID, askPrice, transactionPrice; // Try 'tradesPerThread' times to make a trade for (int i=0; i= askPrice)) { // Transaction occurs somewhere between the bid and ask price transactionPrice = askPrice + curand(&statesOfRandom[index]) % (bidPrice - askPrice + 1); // Update the agent records Buyers[buyerID].price = transactionPrice; Sellers[sellerID].price = transactionPrice; Buyers[buyerID].quantityHeld = 1; Sellers[sellerID].quantityHeld = 0; } if (verbose==true) { // printf("Index: %i Block: %i | i=%i (Buyer[%i] Bid = %i & Seller[%i] Ask = %i) ==> Trans = %i\n", index, blockIdx.x, i, buyerID, Buyers[buyerID].value, sellerID, Sellers[sellerID].value, transactionPrice); } } }; // KERNEL DoTrades() ////////////////////////////////////////////////////////////////////////// // // MAIN // ////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { // Grab the number of desired threads from the command line int threadTarget; if(argc==2) { threadTarget = atoi(argv[1]); } else if(argc>2) { printf("Excess arguments have been ignored.\n"); } else { printf("ERROR: Thread target missing.\n"); return(0); } // Display a friendly header if 
(dataCollect==false) { printf("\nZERO INTELLIGENCE TRADERS\n"); } // Each CUDA device has a unique set of properties // All are displayed for debugging, but only some are used in the code cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); if (verbose==true) { printf("Name: %s\n", prop.name); printf("TotalGlobalMem: %lu\n", prop.totalGlobalMem); printf("SharedMemPerBlock: %lu\n", prop.sharedMemPerBlock); printf("RegsPerBlock: %i\n", prop.regsPerBlock); printf("WarpSize: %i\n", prop.warpSize); printf("MemPitch: %lu\n", prop.memPitch); printf("MaxThreadsPerBlock: %i\n", prop.maxThreadsPerBlock); printf("MaxThreadsDim: [%i, %i, %i]\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); printf("MaxGridSize: [%i, %i, %i]\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf("TotalConstMem: %lu\n", prop.totalConstMem); printf("Major Revision: %i\n", prop.major); printf("Minor Revision: %i\n", prop.minor); printf("ClockRate: %i\n", prop.clockRate); printf("TextureAlignment: %lu\n", prop.textureAlignment); printf("DeviceOverlap: %i\n", prop.deviceOverlap); printf("MultiProcessorCount: %i\n", prop.multiProcessorCount); printf("KernelExecTimeoutEnabled: %i\n", prop.kernelExecTimeoutEnabled); printf("Integrated: %i\n", prop.integrated); printf("CanMapHostMemory: %i\n", prop.canMapHostMemory); printf("ComputeMode: %i\n", prop.computeMode); printf("ConcurrentKernels: %i\n", prop.concurrentKernels); printf("ECCEnabled: %i\n", prop.ECCEnabled); printf("pciBusID: %i\n", prop.pciBusID); printf("pciDeviceID: %i\n", prop.pciDeviceID); printf("tccDriver: %i\n", prop.tccDriver); printf("\n"); } // Initialize pseudorandom number generator with random seed // This is the standard 'C' routine used on the host (CPU) srand(time(NULL)); // Each computation block has a limited number of threads that can run // in parallel int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Threads will be run in groups the size of a warp // While there can be a 
group run that is less than a multiple of warp, // the hardware will 'round up' to the nearest warp size int warpSize = prop.warpSize; // The number of desired threads is divided into blocks with each block // containing some number of threads (up to maxThreadsPerBlock) // Note: These are integer calculations - fractions are dropped along the way int numBlocks = (threadTarget + maxThreadsPerBlock - 1) / maxThreadsPerBlock; int numThreads = threadTarget / numBlocks / warpSize * warpSize; // A special case where the number of threads is less than the warp size // Ensures there are not zero threads if (numThreadsnumberOfAgents) { printf("ERROR! Number of threads (%i) cannot be larger than number of agents (%i)\n",actualThreads,numberOfAgents); return(1); } // Set up and populate the buyer and seller agents // A memory type unique to CUDA is used: Unified Memory // "Unified Memory creates a pool of managed memory that is shared between // the CPU and GPU, bridging the CPU-GPU divide. Managed memory is // accessible to both the CPU and GPU using a single pointer. The key is // that the system automatically migrates data allocated in Unified Memory // between host and device so that it looks like CPU memory to code running // on the CPU, and like GPU memory to code running on the GPU." 
// https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/ Agent *Buyers, *Sellers; gpuErrchk( cudaMallocManaged(&Buyers, numberOfAgents*sizeof(Agent)) ); gpuErrchk( cudaMallocManaged(&Sellers, numberOfAgents*sizeof(Agent)) ); // Initialize agents int i; // First the buyers for (i=0; i>>(time(0), statesOfRandom); // Check for errors gpuErrchk( cudaPeekAtLastError() ); // Wait here (block) until all threads are done gpuErrchk( cudaDeviceSynchronize() ); // Capture the amount of time spent in the DoTrades kernel // Set up an event and get the start time cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); // Execute all trades on the GPU DoTrades<<>>(MaxNumberOfTrades, Buyers, Sellers, statesOfRandom, agentsPerThread, tradesPerThread); // Check for errors gpuErrchk( cudaPeekAtLastError() ); // Wait here (block) until all threads are done gpuErrchk( cudaDeviceSynchronize() ); // Get the event end time cudaEventRecord(stop); cudaEventSynchronize(stop); // Calculate the elapsed time an store in timeDoTrades float timeDoTrades = 0; cudaEventElapsedTime(&timeDoTrades, start, stop); // Compute the statistics float totalTraded = 0.0; int numBought = 0; int numSold = 0; float sum1 = 0.0; float sum2 = 0.0; // First the buyers for (i=0; i