I need to optimize this smooth function to increase speeds using methods such as
ID: 3821932 • Letter: I
Question
I need to optimize this smooth function to increase speeds using methods such as loop unrolling and blocking etc.
The smoothing function takes as input a source image src and returns the smoothed result in the destination image dst. The function avg returns the average of all the pixels around the (i,j)th pixel. Your task is to optimize smooth (and avg) to run as fast as possible. (Note: The function avg is a local function and you can get rid of it altogether to implement smooth in some other way.)
This is the original code with helper functions. THE CODE I NEED TO CHANGE IS THE SMOOTH METHOD - THE CODE I NEED TO CHANGE IS THE SMOOTH METHOD - THE CODE I NEED TO CHANGE IS THE SMOOTH METHOD - THE CODE I NEED TO CHANGE IS THE SMOOTH METHOD - THE CODE I NEED TO CHANGE IS THE SMOOTH METHOD
/* A struct used to compute averaged pixel value */
typedef struct {
int red;
int green;
int blue;
int num;
} pixel_sum;
/* Compute min and max of two integers, respectively */
static int min(int a, int b) { return (a < b ? a : b); }
static int max(int a, int b) { return (a > b ? a : b); }
/*
* initialize_pixel_sum - Initializes all fields of sum to 0
*/
static void initialize_pixel_sum(pixel_sum *sum)
{
sum->red = sum->green = sum->blue = 0;
sum->num = 0;
return;
}
static void accumulate_sum(pixel_sum *sum, pixel p)
{
sum->red += (int) p.red;
sum->green += (int) p.green;
sum->blue += (int) p.blue;
sum->num++;
return;
}
/*
* assign_sum_to_pixel - Computes averaged pixel value in current_pixel
*/
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum)
{
current_pixel->red = (unsigned short) (sum.red/sum.num);
current_pixel->green = (unsigned short) (sum.green/sum.num);
current_pixel->blue = (unsigned short) (sum.blue/sum.num);
return;
}
/*
* avg - Returns averaged pixel value at (i,j)
*/
static pixel avg(int dim, int i, int j, pixel *src)
{
int ii, jj;
pixel_sum sum;
pixel current_pixel;
initialize_pixel_sum(&sum);
for(ii = max(i-1, 0); ii <= min(i+1, dim-1); ii++)
for(jj = max(j-1, 0); jj <= min(j+1, dim-1); jj++)
accumulate_sum(&sum, src[RIDX(ii, jj, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
return current_pixel;
}
char smooth_descr[] = "smooth: Current working version";
void smooth(int dim, pixel *src, pixel *dst)
{
int i, j;
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}
Explanation / Answer
In order to optimize this smooth function, the following code increases the memory used by a factor of 3.
By looking into this code, I understand that it's averaging all the pixels around any given pixel to produce the new value. So in most cases, you are averaging 9 values, on edges 6 values, and in corners, only 4 values.
Since there is some data re-use, cache blocking will be a better option. The biggest gains would come from collapsing all the functions into one. In that case, the blocking would be more apparent.
The following code uses a moving window to calculate series of sums. Basically, for every row compute the sum of the src plus its neighbors. For the end of the rows, that's only 2 cells, for every other element the sum consists of 3 cells.
Next we need to add the neighbors above and below us to our sum, and compute the average and assign it to the dst. Now, we don't have to compute the partial sums across the entire array before we start computing averages, we can work (in general) 3 rows at a time. The top and bottom rows only require 2 rows of sums. So, we special case the first and rows again.
This code performs well because,
Here’s the code for smooth method:
static inline void add_pixel_2(pixel_sum *sum, pixel *a, pixel *b)
{
sum->red = a->red + b->red;
sum->green = a->green + b->green;
sum->blue = a->blue + b->blue;
}
static inline void add_pixel_3(pixel_sum *sum, pixel *a, pixel *b, pixel *c)
{
sum->red = a->red + b->red + c->red;
sum->green = a->green + b->green + c->green;
sum->blue = a->blue + b->blue + c->blue;
}
static inline void do_row_sum(pixel_sum *sums, int dim, pixel *row)
{
int i;
add_pixel_2(sums++, row, row+1);
row++;
for (i=1; i<dim-1; i++) {
add_pixel_3(sums++, row-1, row, row+1);
row++;
}
add_pixel_2(sums, row-1, row);
}
static inline void avg_pixel_sums_2(pixel *dst, pixel_sum *a, pixel_sum *b, int num)
{
dst->red = (a->red + b->red)/num;
dst->green = (a->green + b->green)/num;
dst->blue = (a->blue + b->blue)/num;
}
static inline void avg_pixel_sums_3(pixel *dst, pixel_sum *a, pixel_sum *b, pixel_sum *c, int num)
{
dst->red = (a->red + b->red + c->red)/num;
dst->green = (a->green + b->green + c->green)/num;
dst->blue = (a->blue + b->blue + c->blue)/num;
}
void smooth(int dim, pixel *src, pixel *dst)
{
pixel_sum *row_sums = malloc(3*dim*sizeof(pixel_sum));
pixel_sum *r0 = row_sums;
pixel_sum *r1 = r0+dim;
pixel_sum *r2;
int i, j;
do_row_sum(r0, dim, src);
do_row_sum(r1, dim, src+dim);
// calculate row 0
avg_pixel_sums_2(dst++, r0++, r1++, 4);
for (i=1; i<dim-1; i++)
avg_pixel_sums_2(dst++, r0++, r1++, 6);
avg_pixel_sums_2(dst++, r0, r1, 4);
// calculate all the middle rows
for (i=1; i<dim-1; i++)
{
r0 = row_sums+((i-1)%3)*dim;
r1 = row_sums+(i%3)*dim;
r2 = row_sums+((i+1)%3)*dim;
do_row_sum(r2, dim, src+(i+1)*dim);
avg_pixel_sums_3(dst++, r0++, r1++, r2++, 6);
for (j=1; j<dim-1; j++)
avg_pixel_sums_3(dst++, r0++, r1++, r2++, 9);
avg_pixel_sums_3(dst++, r0, r1, r2, 6);
}
// calculate the last row
r0 = row_sums+((i-1)%3)*dim;
r1 = row_sums+(i%3)*dim;
avg_pixel_sums_2(dst++, r0++, r1++, 4);
for (i=1; i<dim-1; i++)
avg_pixel_sums_2(dst++, r0++, r1++, 6);
avg_pixel_sums_2(dst, r0, r1, 4);
}