Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

T1 & DWT multithreading decoding optimizations #786

Merged
merged 23 commits into from
Sep 13, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
426bf8d
Move some MQC functions into a header for speed
c0nk Dec 27, 2015
c539808
opj_t1_updateflags(): tiny optimization
rouault May 21, 2016
d8fef96
Improve code generation in opj_t1_dec_clnpass()
rouault May 21, 2016
23a01df
Specialize decoding passes for 64x64 code blocks
rouault May 21, 2016
ba1edf6
Reduce number of occurrences of orient function argument
rouault May 21, 2016
31882ad
Const'ify lut arrays so they are in the read-only data section
rouault May 21, 2016
1da397e
Tier 1 decoding: add a colflags array
rouault May 22, 2016
93f7f90
opj_t1_decode_cblks(): tiny perf increase when loop unrolling
rouault May 23, 2016
956c31d
opj_t1_dec_clnpass(): remove useless test in the runlen decoding path…
rouault May 23, 2016
8371491
Better inlining of opj_t1_updateflagscolflags() w.r.t. flags_stride
rouault May 23, 2016
107eb31
Improve perf of opj_t1_dec_sigpass_mqc_vsc() and opj_t1_dec_refpass_m…
rouault May 23, 2016
7092f7e
Fix MSVC210 build issue (use of C99 declaration after statement) intr…
rouault May 23, 2016
54179fe
Add threading and thread pool API
rouault May 25, 2016
d4b7f03
Add opj_codec_set_threads() in public API and propagate resulting thr…
rouault May 25, 2016
5fbb8b2
Use thread-pool for T1 decoding
rouault May 25, 2016
57b216b
Use thread pool for DWT decoding
rouault May 25, 2016
e3eb0a2
.travis.yml: add a conf with OPJ_NUM_THREADS=2
rouault May 25, 2016
d67cd22
opj_decompress: add a -threads <num_threads> option
rouault May 25, 2016
69497d3
opj_decompress: use clock_gettime() instead of getrusage() so as to g…
rouault May 25, 2016
7d3c7a3
Be robust to failed allocations of job structures
rouault May 26, 2016
4f9abb9
[Win32] Use _beginthreadex instead of CreateThread()
rouault Aug 11, 2016
ab22c5b
opj_thread_pool: fix potential deadlock at thread pool destruction
rouault Sep 8, 2016
48c16b2
Merge branch 'master' of https://github.com/uclouvain/openjpeg into t…
rouault Sep 8, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 154 additions & 18 deletions src/lib/openjp2/dwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_st
/**
Inverse wavelet transform in 2-D.
*/
static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);
static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);

static OPJ_BOOL opj_dwt_encode_procedure( opj_tcd_tilecomp_t * tilec,
void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) );
Expand Down Expand Up @@ -473,8 +473,8 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
/* <summary> */
/* Inverse 5-3 wavelet transform in 2-D. */
/* </summary> */
OPJ_BOOL opj_dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) {
return opj_dwt_decode_tile(tilec, numres, &opj_dwt_decode_1);
OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) {
return opj_dwt_decode_tile(tp, tilec, numres, &opj_dwt_decode_1);
}


Expand Down Expand Up @@ -556,10 +556,72 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_U
return mr ;
}

/**
Work item for one horizontal inverse-DWT job (see opj_dwt_decode_h_func()).
Each job processes the rows min_j (inclusive) to max_j (exclusive) of tiledp.
*/
typedef struct
{
/* DWT state handed to the interleave and 1-D transform calls; h.mem is the
   job's scratch buffer and is released by the job function itself. */
opj_dwt_t h;
/* 1-D transform routine invoked on &h once per row. */
DWT1DFN dwt_1D;
/* Number of OPJ_INT32 samples copied back from h.mem into each row. */
OPJ_UINT32 rw;
/* Distance, in samples, between the starts of two consecutive rows of tiledp. */
OPJ_UINT32 w;
/* Sample buffer that is transformed in place. */
OPJ_INT32 * restrict tiledp;
/* First row handled by this job. */
int min_j;
/* One past the last row handled by this job. */
int max_j;
} opj_dwd_decode_h_job_t;

/**
Job function running the horizontal inverse 1-D transform on a range of rows.

For every row j in [job->min_j, job->max_j) the row is interleaved into the
job's scratch buffer, transformed with job->dwt_1D, and the first job->rw
samples of the result are copied back over the row.

The function takes ownership of the job: it releases job->h.mem and the job
structure itself before returning.

@param user_data Pointer to an opj_dwd_decode_h_job_t
@param tls       Unused
*/
static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
{
    int j;
    opj_dwd_decode_h_job_t* job;
    (void)tls;

    job = (opj_dwd_decode_h_job_t*)user_data;
    for (j = job->min_j; j < job->max_j; j++) {
        /* Widen before multiplying: j * w evaluated in 32-bit unsigned
           arithmetic would silently wrap for very large buffers and point
           at the wrong row. */
        OPJ_INT32* row = &job->tiledp[(size_t)j * job->w];

        opj_dwt_interleave_h(&job->h, row);
        (job->dwt_1D)(&job->h);
        memcpy(row, job->h.mem, job->rw * sizeof(OPJ_INT32));
    }

    /* The job owns both its scratch buffer and its own descriptor. */
    opj_aligned_free(job->h.mem);
    opj_free(job);
}

/**
Work item for one vertical inverse-DWT job (see opj_dwt_decode_v_func()).
Each job processes the columns min_j (inclusive) to max_j (exclusive) of tiledp.
*/
typedef struct
{
/* DWT state handed to the interleave and 1-D transform calls; v.mem is the
   job's scratch buffer and is released by the job function itself. */
opj_dwt_t v;
/* 1-D transform routine invoked on &v once per column. */
DWT1DFN dwt_1D;
/* Number of samples written back from v.mem into each column. */
OPJ_UINT32 rh;
/* Distance, in samples, between two vertically adjacent samples of tiledp. */
OPJ_UINT32 w;
/* Sample buffer that is transformed in place. */
OPJ_INT32 * restrict tiledp;
/* First column handled by this job. */
int min_j;
/* One past the last column handled by this job. */
int max_j;
} opj_dwd_decode_v_job_t;

/**
Job function running the vertical inverse 1-D transform on a range of columns.

For every column j in [job->min_j, job->max_j) the column is interleaved into
the job's scratch buffer (using job->w as the stride), transformed with
job->dwt_1D, and the first job->rh samples of the result are scattered back
into the column.

The function takes ownership of the job: it releases job->v.mem and the job
structure itself before returning.

@param user_data Pointer to an opj_dwd_decode_v_job_t
@param tls       Unused
*/
static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
{
    int j;
    opj_dwd_decode_v_job_t* job;
    (void)tls;

    job = (opj_dwd_decode_v_job_t*)user_data;
    for (j = job->min_j; j < job->max_j; j++) {
        OPJ_UINT32 k;
        OPJ_INT32* col = &job->tiledp[j];

        opj_dwt_interleave_v(&job->v, col, (OPJ_INT32)job->w);
        (job->dwt_1D)(&job->v);
        for (k = 0; k < job->rh; ++k) {
            /* Widen before multiplying: k * w evaluated in 32-bit unsigned
               arithmetic would silently wrap for very large buffers and
               store the sample at the wrong offset. */
            col[(size_t)k * job->w] = job->v.mem[k];
        }
    }

    /* The job owns both its scratch buffer and its own descriptor. */
    opj_aligned_free(job->v.mem);
    opj_free(job);
}


/* <summary> */
/* Inverse wavelet transform in 2-D. */
/* </summary> */
static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) {
static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) {
opj_dwt_t h;
opj_dwt_t v;

Expand All @@ -569,11 +631,15 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres
OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - tr->y0); /* height of the resolution level computed */

OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
size_t h_mem_size;
int num_threads;

if (numres == 1U) {
return OPJ_TRUE;
}
h.mem = (OPJ_INT32*)opj_aligned_malloc(opj_dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32));
num_threads = opj_thread_pool_get_thread_count(tp);
h_mem_size = opj_dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32);
h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
if (! h.mem){
/* FIXME event manager error callback */
return OPJ_FALSE;
Expand All @@ -595,23 +661,93 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres
h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
h.cas = tr->x0 % 2;

for(j = 0; j < rh; ++j) {
opj_dwt_interleave_h(&h, &tiledp[j*w]);
(dwt_1D)(&h);
memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32));
}
if( num_threads <= 1 || rh == 1 )
{
for(j = 0; j < rh; ++j) {
opj_dwt_interleave_h(&h, &tiledp[j*w]);
(dwt_1D)(&h);
memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32));
}
}
else
{
int num_jobs = num_threads;
if( rh < num_jobs )
num_jobs = rh;
for( j = 0; j < num_jobs; j++ )
{
opj_dwd_decode_h_job_t* job;

job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allocation shall be checked for failure.

job->h = h;
job->dwt_1D = dwt_1D;
job->rw = rw;
job->w = w;
job->tiledp = tiledp;
job->min_j = j * (rh / num_jobs);
job->max_j = (j+1) * (rh / num_jobs);
if( job->max_j > rh || j == num_jobs - 1 )
job->max_j = rh;
job->h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
if (!job->h.mem)
{
/* FIXME event manager error callback */
opj_thread_pool_wait_completion(tp, 0);
opj_free(job);
opj_aligned_free(h.mem);
return OPJ_FALSE;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the jobs are submitted outside of this loop (i.e. in another loop following it), then it would be possible to fall back to single-threaded decoding in case of an allocation error (and to change the single-thread condition below the loop accordingly). It's very likely that if an out-of-memory condition is hit here, another one will be raised later, so it's quite arguable whether to do this or not.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok, I now just got what you meant. Well, on a modern OS, if malloc failures for such small structures happen you are in big trouble (swap thrashing, etc...), so a clean error exit is probably good enough compared to a smarter fallback strategy.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I said, it was quite arguable. A clean error is probably good enough.

}
opj_thread_pool_submit_job( tp, opj_dwt_decode_h_func, job );
}
opj_thread_pool_wait_completion(tp, 0);
}

v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
v.cas = tr->y0 % 2;

for(j = 0; j < rw; ++j){
OPJ_UINT32 k;
opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w);
(dwt_1D)(&v);
for(k = 0; k < rh; ++k) {
tiledp[k * w + j] = v.mem[k];
}
}
if( num_threads <= 1 || rw == 1 )
{
for(j = 0; j < rw; ++j){
OPJ_UINT32 k;
opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w);
(dwt_1D)(&v);
for(k = 0; k < rh; ++k) {
tiledp[k * w + j] = v.mem[k];
}
}
}
else
{
int num_jobs = num_threads;
if( rw < num_jobs )
num_jobs = rw;
for( j = 0; j < num_jobs; j++ )
{
opj_dwd_decode_v_job_t* job;

job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing allocation check.

job->v = v;
job->dwt_1D = dwt_1D;
job->rh = rh;
job->w = w;
job->tiledp = tiledp;
job->min_j = j * (rw / num_jobs);
job->max_j = (j+1) * (rw / num_jobs);
if( job->max_j > rw || j == num_jobs - 1 )
job->max_j = rw;
job->v.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
if (!job->v.mem)
{
/* FIXME event manager error callback */
opj_thread_pool_wait_completion(tp, 0);
opj_free(job);
opj_aligned_free(v.mem);
return OPJ_FALSE;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}
opj_thread_pool_submit_job( tp, opj_dwt_decode_v_func, job );
}
opj_thread_pool_wait_completion(tp, 0);
}
}
opj_aligned_free(h.mem);
return OPJ_TRUE;
Expand Down
3 changes: 2 additions & 1 deletion src/lib/openjp2/dwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,11 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec);
/**
Inverse 5-3 wavelet transform in 2-D.
Apply a reversible inverse DWT transform to a component of an image.
@param tp Thread pool
@param tilec Tile component information (current tile)
@param numres Number of resolution levels to decode
*/
OPJ_BOOL opj_dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres);
OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres);

/**
Get the gain of a subband for the reversible 5-3 DWT.
Expand Down
2 changes: 1 addition & 1 deletion src/lib/openjp2/tcd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1610,7 +1610,7 @@ static OPJ_BOOL opj_tcd_dwt_decode ( opj_tcd_t *p_tcd )
*/

if (l_tccp->qmfbid == 1) {
if (! opj_dwt_decode(l_tile_comp, l_img_comp->resno_decoded+1)) {
if (! opj_dwt_decode(p_tcd->thread_pool, l_tile_comp, l_img_comp->resno_decoded+1)) {
return OPJ_FALSE;
}
}
Expand Down