Less blocking
This commit is contained in:
parent
1209fd27ec
commit
3a79b7072e
201
src/main.c
201
src/main.c
|
@ -210,14 +210,10 @@ static void UniqueInBlock(const size_t a_in, const size_t b_in,
|
|||
ResetConditionalPermutations(b_out_perms, b_in);
|
||||
while (!b_out_perms[b_in - 1].exhausted) {
|
||||
// Compare the two rows
|
||||
// TODO: Figure out why both A==B and B==A need to be checked.
|
||||
// Of note is that the pathological case I found occurs under a
|
||||
// party swap.
|
||||
// See also: the same problem in UniqueInSubsetPair
|
||||
|
||||
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
|
||||
a_in_perm->permutation, b_in_perm->permutation,
|
||||
a_out_perms, b_out_perms);
|
||||
a_in_perm->permutation, b_in_perm->permutation,
|
||||
a_out_perms, b_out_perms);
|
||||
|
||||
{
|
||||
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
|
||||
|
@ -348,14 +344,10 @@ static void UniqueInSubsetPair(
|
|||
while (!a_out_perms[a_in - 1].exhausted) {
|
||||
ResetConditionalPermutations(b_out_perms, b_in);
|
||||
while (!b_out_perms[b_in - 1].exhausted) {
|
||||
// TODO: Figure out why both A==B and B==A need to be checked.
|
||||
// Of note is that the pathological case I found occurs under a
|
||||
// party swap.
|
||||
|
||||
// Compare the two rows
|
||||
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
|
||||
a_in_perm->permutation, b_in_perm->permutation,
|
||||
a_out_perms, b_out_perms);
|
||||
a_in_perm->permutation, b_in_perm->permutation,
|
||||
a_out_perms, b_out_perms);
|
||||
|
||||
{
|
||||
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
|
||||
|
@ -448,8 +440,6 @@ int main(int argc, char *argv[]) {
|
|||
size_t *unique = NULL;
|
||||
size_t *ends =
|
||||
NULL; // Bin i contains elements with indices ends[i-1]..ends[i]
|
||||
size_t *unique_swap = NULL;
|
||||
size_t *ends_swap = NULL;
|
||||
|
||||
// TODO: Optimization: only one lock is needed
|
||||
pthread_rwlockattr_t attr;
|
||||
|
@ -457,12 +447,10 @@ int main(int argc, char *argv[]) {
|
|||
pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
|
||||
pthread_rwlock_t vector_lock, vector_swap_lock;
|
||||
pthread_rwlock_init(&vector_lock, &attr);
|
||||
pthread_rwlock_init(&vector_swap_lock, &attr);
|
||||
|
||||
#pragma omp parallel default(none) \
|
||||
shared(stderr, args, a_out, b_out, a_in, b_in, row_len, matrix, row_count, \
|
||||
bin_size, bin_count, unique, ends, unique_swap, ends_swap, \
|
||||
vector_lock, vector_swap_lock)
|
||||
bin_size, bin_count, unique, ends, vector_lock)
|
||||
{
|
||||
_Bool *seen = malloc(bin_size * sizeof(_Bool));
|
||||
if (seen == NULL) {
|
||||
|
@ -509,7 +497,10 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// Base round
|
||||
|
||||
#pragma omp for
|
||||
size_t *local_unique = NULL;
|
||||
size_t *local_ends = NULL;
|
||||
|
||||
#pragma omp for schedule(dynamic)
|
||||
for (size_t bin_i = 0; bin_i < bin_count; bin_i++) {
|
||||
const size_t block_i_start = bin_i * bin_size;
|
||||
size_t bin_i_size = row_count - block_i_start;
|
||||
|
@ -521,64 +512,67 @@ int main(int argc, char *argv[]) {
|
|||
cg_buf, &a_in_perm, &b_in_perm, a_out_perms, b_out_perms,
|
||||
block_i_start, bin_i_size);
|
||||
|
||||
pthread_rwlock_wrlock(&vector_lock);
|
||||
{
|
||||
size_t unique_count = 0;
|
||||
for (size_t i = 0; i < bin_i_size; i++) {
|
||||
if (!seen[i]) {
|
||||
stbds_arrput(unique, block_i_start + i);
|
||||
unique_count++;
|
||||
}
|
||||
}
|
||||
if (stbds_arrlenu(ends) == 0) {
|
||||
stbds_arrput(ends, unique_count);
|
||||
} else {
|
||||
size_t last_end = stbds_arrlast(ends);
|
||||
stbds_arrput(ends, last_end + unique_count);
|
||||
size_t unique_count = 0;
|
||||
for (size_t i = 0; i < bin_i_size; i++) {
|
||||
if (!seen[i]) {
|
||||
stbds_arrput(local_unique, block_i_start + i);
|
||||
unique_count++;
|
||||
}
|
||||
}
|
||||
pthread_rwlock_unlock(&vector_lock);
|
||||
size_t ends_len = stbds_arrlenu(local_ends);
|
||||
if (ends_len == 0) {
|
||||
stbds_arrput(local_ends, unique_count);
|
||||
} else {
|
||||
size_t last_end = stbds_arrlast(local_ends);
|
||||
stbds_arrput(local_ends, last_end + unique_count);
|
||||
}
|
||||
|
||||
fprintf(stderr, ".");
|
||||
fflush(stderr);
|
||||
|
||||
} // End of omp for
|
||||
// Implicit barrier
|
||||
|
||||
/*
|
||||
// Merge locals
|
||||
pthread_rwlock_wrlock(&vector_lock);
|
||||
{
|
||||
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
|
||||
stbds_arrpush(unique, local_unique[i]);
|
||||
}
|
||||
size_t ends_base_value = 0;
|
||||
if (stbds_arrlen(ends) > 0) {
|
||||
ends_base_value = stbds_arrlast(ends);
|
||||
}
|
||||
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
|
||||
stbds_arrpush(ends, local_ends[i] + ends_base_value);
|
||||
}
|
||||
}
|
||||
pthread_rwlock_unlock(&vector_lock);
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
stbds_arrfree(local_unique);
|
||||
stbds_arrfree(local_ends);
|
||||
local_unique = NULL;
|
||||
local_ends = NULL;
|
||||
|
||||
#pragma omp single
|
||||
{
|
||||
printf("Base round results:\n");
|
||||
size_t count = stbds_arrlen(ends);
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
size_t *block;
|
||||
size_t len;
|
||||
if (i == 0) {
|
||||
len = ends[0];
|
||||
block = unique;
|
||||
} else {
|
||||
len = ends[i] - ends[i - 1];
|
||||
block = unique + ends[i - 1];
|
||||
}
|
||||
|
||||
printf("%zu[", ends[i]);
|
||||
for (size_t j = 0; j < len; j++) {
|
||||
printf("%zu, ", block[j]);
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
printf("\nReduction rounds:\n");
|
||||
}*/
|
||||
fprintf(stderr, "Base round complete\n");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
// Reductions round
|
||||
|
||||
size_t round_bin_count;
|
||||
{
|
||||
pthread_rwlock_rdlock(&vector_lock);
|
||||
round_bin_count = stbds_arrlenu(ends);
|
||||
pthread_rwlock_unlock(&vector_lock);
|
||||
}
|
||||
pthread_rwlock_rdlock(&vector_lock);
|
||||
{ round_bin_count = stbds_arrlenu(ends); }
|
||||
pthread_rwlock_unlock(&vector_lock);
|
||||
|
||||
while (round_bin_count > 1) {
|
||||
const size_t pairs = round_bin_count / 2;
|
||||
|
||||
#pragma omp for
|
||||
#pragma omp for schedule(dynamic)
|
||||
for (size_t pair_i = 0; pair_i < pairs; pair_i++) {
|
||||
size_t *first_block_idxs, *second_block_idxs;
|
||||
size_t first_block_len, second_block_len;
|
||||
|
@ -598,81 +592,76 @@ int main(int argc, char *argv[]) {
|
|||
b_out_perms, first_block_idxs, first_block_len,
|
||||
second_block_idxs, second_block_len);
|
||||
|
||||
pthread_rwlock_wrlock(&vector_swap_lock);
|
||||
{
|
||||
size_t reduction_size = first_block_len;
|
||||
for (size_t i = 0; i < first_block_len; i++) {
|
||||
stbds_arrput(unique_swap, first_block_idxs[i]);
|
||||
stbds_arrput(local_unique, first_block_idxs[i]);
|
||||
}
|
||||
for (size_t i = 0; i < second_block_len; i++) {
|
||||
if (!seen[i]) {
|
||||
stbds_arrput(unique_swap, second_block_idxs[i]);
|
||||
stbds_arrput(local_unique, second_block_idxs[i]);
|
||||
reduction_size++;
|
||||
}
|
||||
}
|
||||
if (stbds_arrlenu(ends_swap) == 0) {
|
||||
stbds_arrput(ends_swap, reduction_size);
|
||||
if (stbds_arrlenu(local_ends) == 0) {
|
||||
stbds_arrput(local_ends, reduction_size);
|
||||
} else {
|
||||
size_t last_end = stbds_arrlast(ends_swap);
|
||||
stbds_arrput(ends_swap, last_end + reduction_size);
|
||||
size_t last_end = stbds_arrlast(local_ends);
|
||||
stbds_arrput(local_ends, last_end + reduction_size);
|
||||
}
|
||||
}
|
||||
pthread_rwlock_unlock(&vector_swap_lock);
|
||||
|
||||
} // End of loop over pairs
|
||||
// Implicit barrier
|
||||
// This barrier is important, as it ensures `unique` and `ends` to be
|
||||
// read only without locking.
|
||||
// For the same reason, and in conjunction with the following #single
|
||||
// section, there is no need to lock anything in the following #single
|
||||
// section
|
||||
// Implicit barrier. Important because of the following #single
|
||||
|
||||
#pragma omp single
|
||||
{
|
||||
if (IsOdd(round_bin_count)) {
|
||||
// Push the remaining bin into the swap variables
|
||||
// Note that stbds_arrlenu(ends) is guaranteed to be >= 2 at this
|
||||
// point
|
||||
size_t *odd_block_idxs = unique + ends[stbds_arrlenu(ends) - 2];
|
||||
size_t odd_block_len =
|
||||
stbds_arrlast(ends) - ends[stbds_arrlenu(ends) - 2];
|
||||
|
||||
for (size_t i = 0; i < odd_block_len; i++) {
|
||||
stbds_arrput(unique_swap, odd_block_idxs[i]);
|
||||
stbds_arrput(local_unique, odd_block_idxs[i]);
|
||||
}
|
||||
size_t last_end = stbds_arrlast(ends_swap);
|
||||
stbds_arrput(ends_swap, last_end + odd_block_len);
|
||||
}
|
||||
|
||||
/*
|
||||
size_t count = stbds_arrlen(ends_swap);
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
size_t *block;
|
||||
size_t len;
|
||||
if (i == 0) {
|
||||
len = ends_swap[0];
|
||||
block = unique_swap;
|
||||
if (stbds_arrlenu(local_ends) > 0) {
|
||||
size_t last_end = stbds_arrlast(local_ends);
|
||||
stbds_arrput(local_ends, last_end + odd_block_len);
|
||||
} else {
|
||||
len = ends_swap[i] - ends_swap[i - 1];
|
||||
block = unique_swap + ends_swap[i - 1];
|
||||
stbds_arrput(local_ends, odd_block_len);
|
||||
}
|
||||
|
||||
printf("%zu[", ends_swap[i]);
|
||||
for (size_t j = 0; j < len; j++) {
|
||||
printf("%zu, ", block[j]);
|
||||
}
|
||||
printf("] ");
|
||||
}
|
||||
printf("\n");
|
||||
*/
|
||||
|
||||
|
||||
stbds_arrfree(unique);
|
||||
stbds_arrfree(ends);
|
||||
unique = unique_swap;
|
||||
ends = ends_swap;
|
||||
unique_swap = NULL;
|
||||
ends_swap = NULL;
|
||||
unique = NULL;
|
||||
ends = NULL;
|
||||
} // End of single section
|
||||
// Implicit barrier
|
||||
// Implicit barrier. Important due to the clearing of unique, local.
|
||||
|
||||
// Merge locals into global
|
||||
pthread_rwlock_wrlock(&vector_lock);
|
||||
{
|
||||
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
|
||||
stbds_arrpush(unique, local_unique[i]);
|
||||
}
|
||||
size_t ends_base_value = 0;
|
||||
if (stbds_arrlen(ends) > 0) {
|
||||
ends_base_value = stbds_arrlast(ends);
|
||||
}
|
||||
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
|
||||
stbds_arrpush(ends, local_ends[i] + ends_base_value);
|
||||
}
|
||||
}
|
||||
pthread_rwlock_unlock(&vector_lock);
|
||||
|
||||
stbds_arrfree(local_unique);
|
||||
stbds_arrfree(local_ends);
|
||||
local_unique = NULL;
|
||||
local_ends = NULL;
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
pthread_rwlock_rdlock(&vector_lock);
|
||||
{ round_bin_count = stbds_arrlenu(ends); }
|
||||
|
|
Loading…
Reference in New Issue