Matrix/Perkalian

Dari PaloDozen

Perkalian matriks adalah operasi yang sangat sering diperlukan pada berbagai masalah komputasi. Perkalian ini dapat dipercepat dengan menggunakan algoritma paralel.


Daftar isi

Algoritma Perkalian Matriks

Perkalian dua buah matriks A dan B akan menghasilkan satu matriks C baru dimana setiap elemen matriks C dihitung sebagai


C_{ij} = \sum_{k=1}^n (A_{ik} * B_{kj})

Perkalian matriks Sekuensial

Perkalian matriks secara sekuensial memerlukan tiga kalang (loop) sebagai berikut:

/******************************************************************************
* Matrix Multiplication 
* Eko M. Budi
******************************************************************************/
 
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/time.h>
#include <pthread.h>
#include <time.h>
 
#define NDIM 1024                             /* number of NDIMs */
double          a[NDIM][NDIM];
double          b[NDIM][NDIM];
double          c[NDIM][NDIM];
struct timeval tvstart, tvstop;
 
int main(int argc, char *argv[])
{
   int             i, j, k;
   double          sum;
 
   // fill the matriks
   for (i = 0; i < NDIM; i++) {
     for (j = 0; j < NDIM; j++) {
        a[i][j] = i + j;
	b[i][j] = i + j;
     }
   }
 
   // start the bechmark timer
   gettimeofday(&tvstart, NULL);
   for (i=0; i<NDIM; i++) {
      for (j=0; j<NDIM; j++) {
         sum=0;
         for (k=0; k<NDIM; k++) {
             sum = sum + a[i][k] * b[k][j];
         }
         c[i][j] = sum;
      }
   }
 
   gettimeofday(&tvstop, NULL);
 
   double t = (tvstop.tv_sec - tvstart.tv_sec) * 1000 + 
               (tvstop.tv_usec - tvstart.tv_usec) * 1e-3;
 
   printf("Sequential Dim: %d : Time: %f\n", n, NDIM, t);
   return 0;
}


Perkalian Matriks PThread

Program matrix.c berikut menggunakan pustaka pthread untuk menghitung matrix secara paralel di komputer multicore.

/******************************************************************************
* Matrix Multiplication Program using pthread
* Eko M. Budi
******************************************************************************/
 
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/time.h>
#include <pthread.h>
#include <time.h>
 
#define MAX_THREAD 16                         /* maximum number of threads */
#define NDIM 1024                             /* number of NDIMs */
double          a[NDIM][NDIM];
double          b[NDIM][NDIM];
double          c[NDIM][NDIM];
struct timeval tvstart, tvstop;
 
typedef struct
{
    int             id;                       /* identification */
    int             noproc;                   /* process */
    int             dim;                      /* number of threads */
    double	(*a)[NDIM][NDIM],(*b)[NDIM][NDIM],(*c)[NDIM][NDIM];
} parm;
 
void mm(int me_no, int noproc, int n, double a[NDIM][NDIM], double b[NDIM][NDIM], double c[NDIM][NDIM])
{
    int i,j,k;
    double sum;
    i=me_no;
    while (i<n) {
       for (j = 0; j < n; j++) {
          sum = 0.0;
	  for (k = 0; k < n; k++) {
	     sum = sum + a[i][k] * b[k][j];
	  }
	  c[i][j] = sum;
       }
       i+=noproc;
    }
}
 
void * worker(void *arg)
{
     parm           *p = (parm *) arg;
     mm(p->id, p->noproc, p->dim, *(p->a), *(p->b), *(p->c));
}
 
 
int main(int argc, char *argv[])
{
   int             j, k, noproc, me_no;
   double          sum;
   double          t1, t2;
   pthread_t      *threads;
   parm           *arg;
   int             n, i;
   for (i = 0; i < NDIM; i++)
     for (j = 0; j < NDIM; j++) {
        a[i][j] = i + j;
	b[i][j] = i + j;
     }
   if (argc < 2) {
     n = MAX_THREAD;
   }
   else {
      n = atoi(argv[1]);
   }
 
   // start the bechmark timer
   gettimeofday(&tvstart, NULL);
   threads = (pthread_t *) malloc(n * sizeof(pthread_t));
   arg=(parm *)malloc(sizeof(parm)*n);
 
   for (i = 0; i < n; i++) {
	arg[i].id = i;
	arg[i].noproc = n;
	arg[i].dim = NDIM;
	arg[i].a = &a;
	arg[i].b = &b;
	arg[i].c = &c;
	pthread_create(&threads[i], NULL, worker, (void *)(arg+i));
    }
 
    for (i = 0; i < n; i++) {
	pthread_join(threads[i], NULL);
    }
 
    gettimeofday(&tvstop, NULL);
 
    double t = (tvstop.tv_sec - tvstart.tv_sec) * 1000 + 
               (tvstop.tv_usec - tvstart.tv_usec) * 1e-3;
 
    printf("Pthread: %d : Dim: %d : Time: %f\n", n, NDIM, t);
 
    free(arg);
    return 0;
}

Untuk mengkompilasi program ini, buat Makefile berikut

GCC=gcc -pthread
all : matrix 
matrix : matrix.c
	$(GCC) -o matrix matrix.c

Buat juga skrip run.sh berikut untuk mencoba dengan berbagai jumlah thread.

#!/bin/sh
for NT in 8 4 2 1; do
   ./matrix $NT   
   sleep 100
done

Setelah semuanya siap, kompilasi dan jalankan dengan perintah

make
sh run.sh

Matriks MPI

/******************************************************************************
 
* Matrix Multiplication Program using MPI
* Eko Mursito Budi
* Modified from http://heshans.blogspot.com/2009/05/matrix-multiplication-using-mpi.html
*  
******************************************************************************/
 
 
 
#include <stdio.h>
#include "mpi.h"
 
#define MASTER 0		/* taskid of first task */
#define FROM_MASTER 1		/* setting a message type */
#define FROM_WORKER 2		/* setting a message type */
 
#define NDIM 1024
MPI_Status status;
 
double a[NDIM][NDIM], 		/* matrix A to be multiplied */
       b[NDIM][NDIM],      	/* matrix B to be multiplied */
       c[NDIM][NDIM];	    	/* result matrix C */
 
main(int argc, char **argv) 
{
 
    int numtasks,	        /* number of tasks in partition */
    taskid,		    	/* a task identifier */
    numworkers,			/* number of worker tasks */
    source,		    	/* task id of message source */
    dest,		     	/* task id of message destination */
    nbytes,		      	/* number of bytes in message */
    mtype,		       	/* message type */
    intsize,			/* size of an integer in bytes */
    dbsize,		    	/* size of a double float in bytes */
    rows,                      	/* rows of matrix A sent to each worker */
    averow, extra, offset,      /* used to determine rows sent to each worker */
    i, j, k,			/* misc */
    count;
 
    struct timeval start, stop;
    intsize = sizeof(int);
    dbsize = sizeof(double);
 
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
 
    // calculate the number of workers 
    numworkers = numtasks-1;
 
 
    /*---------------------------- master ----------------------------*/
 
if (taskid == MASTER) {
 
  printf("Number of worker tasks = %d\n",numworkers);
 
  for (i=0; i<NRA; i++)
 
    for (j=0; j<NCA; j++)
 
      a[i][j]= i+j;
 
  for (i=0; i<NCA; i++)
 
    for (j=0; j<NCB; j++)
 
      b[i][j]= i*j;
 
 
 
gettimeofday(&start, 0);
 
 
 
  /* send matrix data to the worker tasks */
 
  averow = NRA/numworkers;
 
  extra = NRA%numworkers;
 
  offset = 0;
 
  mtype = FROM_MASTER;
 
  for (dest=1; dest<=numworkers; dest++) {			
 
    rows = (dest <= extra) ? averow+1 : averow;   	
 
    //printf("   Sending %d rows to task %d\n",rows,dest);
 
    MPI_Send(&offset, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
 
    MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
 
    count = rows*NCA;
 
    MPI_Send(&a[offset][0], count, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
 
    count = NCA*NCB;
 
    MPI_Send(&b, count, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
 
 
 
    offset = offset + rows;
 
    }
 
 
 
  /* wait for results from all worker tasks */
 
  mtype = FROM_WORKER;
 
  for (i=1; i<=numworkers; i++)	{			
 
    source = i;
 
    MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
 
    MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
 
    count = rows*NCB;
 
    MPI_Recv(&c[offset][0], count, MPI_DOUBLE, source, mtype, MPI_COMM_WORLD, 
 
               &status);
 
 
 
    }
 
 
 
  #ifdef PRINT
 
  printf("Here is the result matrix\n");
 
  for (i=0; i<NRA; i++) { 
 
    printf("\n"); 
 
    for (j=0; j<NCB; j++) 
 
      printf("%6.2f   ", c[i][j]);
 
    }
 
  printf ("\n");
 
  #endif
 
  gettimeofday(&stop, 0);
  fprintf(stdout,"Time = %.6f\n\n",
         (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6));
 
  }  /* end of master section */
 
/*---------------------------- worker (slave)----------------------------*/
if (taskid > MASTER) {
  mtype = FROM_MASTER;
  source = MASTER;
 
  #ifdef PRINT
 
  printf ("Master =%d, mtype=%d\n", source, mtype);
 
  #endif
 
  MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
 
  #ifdef PRINT 
 
  printf ("offset =%d\n", offset);
 
  #endif
 
  MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
 
  #ifdef PRINT
 
  printf ("row =%d\n", rows);
 
  #endif
 
  count = rows*NCA;
 
  MPI_Recv(&a, count, MPI_DOUBLE, source, mtype, MPI_COMM_WORLD, &status);
 
  #ifdef PRINT
 
  printf ("a[0][0] =%e\n", a[0][0]);
 
  #endif
 
  count = NCA*NCB;
 
  MPI_Recv(&b, count, MPI_DOUBLE, source, mtype, MPI_COMM_WORLD, &status);
 
  #ifdef PRINT
 
  printf ("b=\n");
 
  #endif
 
  for (k=0; k<NCB; k++)
 
    for (i=0; i<rows; i++) {
 
      c[i][k] = 0.0;
 
      for (j=0; j<NCA; j++)
 
        c[i][k] = c[i][k] + a[i][j] * b[j][k];
 
      }
 
 
 
  //mtype = FROM_WORKER;
 
  #ifdef PRINT
 
  printf ("after computer\n");
 
  #endif
 
  //MPI_Send(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
 
  MPI_Send(&offset, 1, MPI_INT, MASTER, FROM_WORKER, MPI_COMM_WORLD);
 
  //MPI_Send(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
 
  MPI_Send(&rows, 1, MPI_INT, MASTER, FROM_WORKER, MPI_COMM_WORLD);
 
  //MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD);
 
  MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, FROM_WORKER, MPI_COMM_WORLD);
 
  #ifdef PRINT
 
  printf ("after send\n");
 
  #endif
 
  }  /* end of worker */ 
 
  MPI_Finalize();
} /* of main */

Matriks CUDA



Rujukan


Kontributor: Mursito