Parallel matrix multiplication

Main considerations:

First we have to define a function that calculates one line of the resulting matrix:
template<class T>
void Multiply(T *items1, T *items2, int size, T *result)
{
  for (int i = 0; i < size; ++i) {               // go through the line items of the result
    result[i] = 0;                               // the sum of the multiplications is stored in result[i]
    for (int j = 0; j < size; ++j)               // go through the corresponding items of the input line and column
      result[i] += items1[j] * items2[j * size]; // multiply the line items of the first matrix by the column items of the second matrix
    }
}

After loading the test matrices we can easily do the calculation, first without parallelization:
    for (int i = 0; i < size; ++i)
      Multiply(items1 + i*size, items2 + i, size, result + i*size);

Then we can define our task that calculates the lines of the result matrix (the lines represent a chunk within a matrix: from the first line to the last):
template<class T>
class TMultiplierTask: public TTaskDispatcher::TTask
{
    int StartLine, // chunk first line
        EndLine,   // chunk last line
        Size;      // matrix size
    T *Items1,     // the items of the first input matrix
      *Items2,     //       -"-        second     -"-
      *Result;     //       -"-        resulting  -"-

  public:
    TMultiplierTask(int startline, int endline, int size, T *items1, T *items2, T *result): StartLine(startline), EndLine(endline), Size(size), Items1(items1), Items2(items2), Result(result) {}
    void Execute();
};
//---------------------------------------------------------------------------
template
void TMultiplierTask::Execute()
{
  for (int i = StartLine; i <= EndLine; ++i)
    Multiply(Items1 + i*Size, Items2 + i, Size, Result + i*Size); // we simply multiply the line items by the column items
}

The parallel multiplication is quite simple now:
    vector<TTaskDispatcher::TTask *> tasks;      // the task container

    for (int i = 0; i < size; i += chunk_size) { // i is the first line of the task
      int last = i + chunk_size - 1;             // calculate the last line, and trim if it exceeds the range
      if (last >= size)
        last = size - 1;

      TMultiplierTask<TItem> *t = new TMultiplierTask<TItem>(i, last, size, items1, items2, result); // create the task
      tasks.push_back(t);
      }

    TTaskDispatcher dp(threadno); // create a dispatcher in fork-join mode
    start = GetTickCount();
    dp.Execute(tasks);            // and run the tasks
    end = GetTickCount();

Test results (0 threads means the default - not parallel - case), the columns are the matrix sizes (the chunk size is 1, that is one task calculates one line of the result matrix):
Threads5001000
02196240
12346271
21243167
3782106
4321622

As we can see the performance increase is proportional to the number of processors.

Download the test program.


Attila NAGY Last updated: 2018-08-30