영상처리

Sobel Code (SIMD)

park__ 2024. 12. 9. 15:47

Sobel SIMD로 구현된 Code

 

속도 비교

 

Code 경로 : https://github.com/m3y54m/sobel-simd-opencv/tree/master

 

// Refactored for build in Ubuntu

#include <opencv2/opencv.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Precise Time Measurement
#if defined(_WIN32) || defined(__CYGWIN__)
// Windows (x86 or x64)
#include <windows.h>
#elif defined(__linux__)
// Linux
#include <time.h>
#else
#error Unknown environment!
#endif

// Intel x86 SIMD Intrinsics
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif

using namespace cv;
using namespace std;

unsigned long calculateElapsedTime(unsigned long start, unsigned long stop);
unsigned long getCurrentTimeInMicroseconds();
void SobelNonSimd(Rect cropSize, Mat inputImage);
void SobelSimd(Rect cropSize, Mat inputImage);
void SobelOpenCV(Rect cropSize, Mat inputImage);

// precise time measurement
unsigned long t1, t2, elapsedTime;

// OpenCV image datatypes
Mat originalImage;
Mat resultImageNonSimd;
Mat resultImageSimd;
Mat resultImageOpenCV;

uchar *outputPointer;
uchar *inputPointer;

int width;
int height;

// crop image and remove added borders
const int border = 8;

int main(int argc, char **argv)
{
    if (border < 1)
    {
        //cout << "border must be greater than or equal to 1" << endl;
        //cout << "Press any key to exit!" << endl;
        //getchar();
        //return 0;
    }

    // load input image
    originalImage = imread("monarch.jpg", IMREAD_GRAYSCALE);

    originalImage.convertTo(originalImage, CV_8U, 255.0 / 65535.0);

    int imageWidth = originalImage.cols;
    int imageHeight = originalImage.rows;

    Rect cropped(border, border, imageWidth, imageHeight);

    // show input image
    // namedWindow("Original", WINDOW_AUTOSIZE);
    // imshow("Original", originalImage);
    // add border to image using replication method
    copyMakeBorder(originalImage, originalImage, border, border, border, border, BORDER_REPLICATE);

    /******************* Non-SIMD ********************/

    SobelNonSimd(cropped, originalImage);

    // namedWindow("Sobel-Non-SIMD", WINDOW_AUTOSIZE);
    // imshow("Sobel-Non-SIMD", resultImageNonSimd);

    /********************** SIMD *********************/

    SobelSimd(cropped, originalImage);

    // namedWindow("Sobel-SIMD", WINDOW_AUTOSIZE);
    // imshow("Sobel-SIMD", resultImageSimd);

    /************* OpenCV Built-in Sobel *************/

    SobelOpenCV(cropped, originalImage);

    // namedWindow("Sobel-OpenCV", WINDOW_AUTOSIZE);
    // imshow("Sobel-OpenCV", resultImageOpenCV);

    // waitKey(); // Wait for a keystroke in the window
    // getchar();
    return 0;
}

unsigned long calculateElapsedTime(unsigned long start, unsigned long stop)
{
    return stop - start;
}

unsigned long getCurrentTimeInMicroseconds()
{
#if defined(_WIN32) || defined(__CYGWIN__)
    // Windows (x86 or x64)
    LARGE_INTEGER freq;
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t);
    return (unsigned long)((1.0e6 * t.QuadPart) / freq.QuadPart);
#elif defined(__linux__)
    // Linux
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return (unsigned long)(1.0e6 * t.tv_sec + 1.0e-3 * t.tv_nsec);
#else
    return 0;
#error Unknown environment!
#endif
}

void SobelNonSimd(Rect cropSize, Mat inputImage)
{
    Mat outputImage;

    // allocate new array data for outputImage with the same size as inputImage.
    outputImage.create(inputImage.size(), inputImage.depth());

    uchar m1, m2, m3, m4, m5, m6, m7, m8, m9;
    int Gx, Gy;

    outputPointer = outputImage.ptr<uchar>();
    inputPointer = inputImage.ptr<uchar>();

    width = inputImage.cols;
    height = inputImage.rows;

    // start timer
    t1 = getCurrentTimeInMicroseconds();

    for (int i = (border - 1); i < height - (border + 1); i++) // rows
    {
        for (int j = (border - 1); j < width - (border + 1); j++) // cols
        {

            /*
            Sobel operator input matrix
            +~~~~~~~~~~~~~~+
            | m1 | m2 | m3 |
            |~~~~+~~~~+~~~~+
            | m4 | m5 | m6 |
            |~~~~+~~~~+~~~~+
            | m7 | m8 | m9 |
            +~~~~+~~~~+~~~~+
            */

            m1 = *(inputPointer + i * width + j);
            m2 = *(inputPointer + i * width + j + 1);
            m3 = *(inputPointer + i * width + j + 2);

            m4 = *(inputPointer + (i + 1) * width + j);
            m5 = *(inputPointer + (i + 1) * width + j + 1);
            m6 = *(inputPointer + (i + 1) * width + j + 2);

            m7 = *(inputPointer + (i + 2) * width + j);
            m8 = *(inputPointer + (i + 2) * width + j + 1);
            m9 = *(inputPointer + (i + 2) * width + j + 2);

            // Calculating Gx
            Gx = (m3 + 2 * m6 + m9) - (m1 + 2 * m4 + m7);

            // Calculating Gy
            Gy = (m1 + 2 * m2 + m3) - (m7 + 2 * m8 + m9);

            outputPointer[(i + 1) * width + j + 1] = saturate_cast<uchar>(abs(Gx) + abs(Gy)); // approximate
        }
    }

    // stop timer
    t2 = getCurrentTimeInMicroseconds();
    // calculate and print elapsed time in microseconds
    elapsedTime = calculateElapsedTime(t1, t2);
    cout << "Execution time for non-SIMD Sobel edge detection:" << endl;
    cout << elapsedTime << " us" << endl;

    outputImage = outputImage(cropSize);

    // Copy outputImage to resultImageNonSimd
    resultImageNonSimd = outputImage.clone();
}

void SobelSimd(Rect cropSize, Mat inputImage)
{
    Mat outputImage;

    // inputImage = imread("boat.png", IMREAD_GRAYSCALE);
    outputImage.create(inputImage.size(), inputImage.depth());

    __m128i p1, p2, p3, p4, p5, p6, p7, p8, p9;
    __m128i gx, gy, temp, G;

    outputPointer = outputImage.ptr<uchar>();
    inputPointer = inputImage.ptr<uchar>();

    width = inputImage.cols;
    height = inputImage.rows;

    // start timer
    t1 = getCurrentTimeInMicroseconds();

    for (int i = (border - 1); i < height - (border + 1); i += 1)
    {
        for (int j = (border - 1); j < width - (2 * border - 1); j += 8)
        {

            /*
            Sobel operator input matrix
            +~~~~~~~~~~~~~~+
            | p1 | p2 | p3 |
            |~~~~+~~~~+~~~~+
            | p4 | p5 | p6 |
            |~~~~+~~~~+~~~~+
            | p7 | p8 | p9 |
            +~~~~+~~~~+~~~~+
            */

            /*
            __m128i _mm_loadu_si128 (__m128i const* mem_addr)
            Load 128-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.
            */

            p1 = _mm_loadu_si128((__m128i *)(inputPointer + i * width + j));
            p2 = _mm_loadu_si128((__m128i *)(inputPointer + i * width + j + 1));
            p3 = _mm_loadu_si128((__m128i *)(inputPointer + i * width + j + 2));

            p4 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 1) * width + j));
            p5 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 1) * width + j + 1));
            p6 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 1) * width + j + 2));

            p7 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 2) * width + j));
            p8 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 2) * width + j + 1));
            p9 = _mm_loadu_si128((__m128i *)(inputPointer + (i + 2) * width + j + 2));

            /*
            __m128i _mm_srli_epi16 (__m128i a, int imm8)
            Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the Gs in dst.

            __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
            Unpack and interleave 8-bit integers from the low half of a and b, and store the Gs in dst.
            */

            // convert image 8-bit unsigned integer data to 16-bit signed integers to use in arithmetic operations
            p1 = _mm_srli_epi16(_mm_unpacklo_epi8(p1, p1), 8);
            p2 = _mm_srli_epi16(_mm_unpacklo_epi8(p2, p2), 8);
            p3 = _mm_srli_epi16(_mm_unpacklo_epi8(p3, p3), 8);
            p4 = _mm_srli_epi16(_mm_unpacklo_epi8(p4, p4), 8);
            p5 = _mm_srli_epi16(_mm_unpacklo_epi8(p5, p5), 8);
            p6 = _mm_srli_epi16(_mm_unpacklo_epi8(p6, p6), 8);
            p7 = _mm_srli_epi16(_mm_unpacklo_epi8(p7, p7), 8);
            p8 = _mm_srli_epi16(_mm_unpacklo_epi8(p8, p8), 8);
            p9 = _mm_srli_epi16(_mm_unpacklo_epi8(p9, p9), 8);

            /*
            __m128i _mm_add_epi16 (__m128i a, __m128i b)
            Add packed 16-bit integers in a and b, and store the Gs in dst.

            __m128i _mm_sub_epi16 (__m128i a, __m128i b)
            Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the Gs in dst.
            */

            // Calculating Gx = (p3 + 2 * p6 + p9) - (p1 + 2 * p4 + p7)
            gx = _mm_add_epi16(p6, p6);   // 2*p6
            gx = _mm_add_epi16(gx, p3);   // p3 + 2*p6
            gx = _mm_add_epi16(gx, p9);   // p3 + 2*p6 + p9
            gx = _mm_sub_epi16(gx, p1);   // p3 + 2*p6 + p9 - p1
            temp = _mm_add_epi16(p4, p4); // 2*p4
            gx = _mm_sub_epi16(gx, temp); // p3 + 2*p6 + p9 - (p1 + 2*p4)
            gx = _mm_sub_epi16(gx, p7);   // p3 + 2*p6 + p9 - (p1 + 2*p4 + p7)

            // Calculating Gy = (p1 + 2 * p2 + p3) - (p7 + 2 * p8 + p9)
            gy = _mm_add_epi16(p2, p2);   // 2*p2
            gy = _mm_add_epi16(gy, p1);   // p1 + 2*p2
            gy = _mm_add_epi16(gy, p3);   // p1 + 2*p2 + p3
            gy = _mm_sub_epi16(gy, p7);   // p1 + 2*p2 + p3 - p7
            temp = _mm_add_epi16(p8, p8); // 2*p8
            gy = _mm_sub_epi16(gy, temp); // p1 + 2*p2 + p3 - (p7 + 2*p8)
            gy = _mm_sub_epi16(gy, p9);   // p1 + 2*p2 + p3 - (p7 + 2*p8 + p9)

            /*
            __m128i _mm_abs_epi16 (__m128i a)
            Compute the absolute value of packed 16-bit integers in a, and store the unsigned Gs in dst.
            */

            gx = _mm_abs_epi16(gx); // |Gx|
            gy = _mm_abs_epi16(gy); // |Gy|

            // G = |Gx| + |Gy|
            G = _mm_add_epi16(gx, gy);

            /*
            __m128i _mm_packus_epi16 (__m128i a, __m128i b)
            Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.
            */
            G = _mm_packus_epi16(G, G);

            /*
            void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
            Store 128-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.
            */
            _mm_storeu_si128((__m128i *)(outputPointer + (i + 1) * width + j + 1), G);
        }
    }

    // stop timer
    t2 = getCurrentTimeInMicroseconds();
    // calculate and print elapsed time in microseconds
    elapsedTime = calculateElapsedTime(t1, t2);
    cout << "Execution time for SIMD Sobel edge detection:" << endl;
    cout << elapsedTime << " us" << endl;

    // crop image and remove added borders
    outputImage = outputImage(cropSize);

    // Copy outputImage to resultImageSimd
    resultImageSimd = outputImage.clone();
}

void SobelOpenCV(Rect cropSize, Mat inputImage)
{
    Mat outputImage;

    /// Generate grad_x and grad_y
    Mat grad_x, grad_y;
    Mat abs_grad_x, abs_grad_y;
    int scale = 1;
    int delta = 0;
    int ddepth = CV_8U;

    inputImage = inputImage(cropSize);

    // start timer
    t1 = getCurrentTimeInMicroseconds();

    // Gradient X
    Sobel(inputImage, grad_x, ddepth, 1, 0, 3, scale, delta, BORDER_DEFAULT);
    convertScaleAbs(grad_x, abs_grad_x, 1, 0);

    // Gradient Y
    Sobel(inputImage, grad_y, ddepth, 0, 1, 3, scale, delta, BORDER_DEFAULT);
    convertScaleAbs(grad_y, abs_grad_y, 1, 0);

    /// Total Gradient (approximate)
    addWeighted(abs_grad_x, 2.0, abs_grad_y, 2.0, 0, outputImage);

    // stop timer
    t2 = getCurrentTimeInMicroseconds();
    // calculate and print elapsed time in microseconds
    elapsedTime = calculateElapsedTime(t1, t2);
    cout << "Execution time for OpenCV Sobel edge detection:" << endl;
    cout << elapsedTime << " us" << endl;

    // Copy outputImage to resultImageOpenCV
    resultImageOpenCV = outputImage.clone();
}

 

'영상처리' 카테고리의 다른 글

Auto Threshold (Li) C++  (1) 2024.12.10
Auto Threshold (Huang) C++  (0) 2024.12.10
OpenCV cv::saturate_cast  (0) 2024.12.09
<C++> Saliency Detection  (0) 2023.09.20
Watersheds 알고리즘  (0) 2023.09.04