Less blocking

This commit is contained in:
Miguel M 2023-06-05 23:51:56 +01:00
parent 1209fd27ec
commit 3a79b7072e
1 changed files with 95 additions and 106 deletions

View File

@ -210,14 +210,10 @@ static void UniqueInBlock(const size_t a_in, const size_t b_in,
ResetConditionalPermutations(b_out_perms, b_in);
while (!b_out_perms[b_in - 1].exhausted) {
// Compare the two rows
// TODO: Figure out why both A==B and B==A need to be checked.
// Of note is that the pathological case I found occurs under a
// party swap.
// See also: the same problem in UniqueInSubsetPair
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
a_in_perm->permutation, b_in_perm->permutation,
a_out_perms, b_out_perms);
a_in_perm->permutation, b_in_perm->permutation,
a_out_perms, b_out_perms);
{
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
@ -348,14 +344,10 @@ static void UniqueInSubsetPair(
while (!a_out_perms[a_in - 1].exhausted) {
ResetConditionalPermutations(b_out_perms, b_in);
while (!b_out_perms[b_in - 1].exhausted) {
// TODO: Figure out why both A==B and B==A need to be checked.
// Of note is that the pathological case I found occurs under a
// party swap.
// Compare the two rows
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
a_in_perm->permutation, b_in_perm->permutation,
a_out_perms, b_out_perms);
a_in_perm->permutation, b_in_perm->permutation,
a_out_perms, b_out_perms);
{
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
@ -448,8 +440,6 @@ int main(int argc, char *argv[]) {
size_t *unique = NULL;
size_t *ends =
NULL; // Bin i contains elements with indices ends[i-1]..ends[i]
size_t *unique_swap = NULL;
size_t *ends_swap = NULL;
// TODO: Optimization: only one lock is needed
pthread_rwlockattr_t attr;
@ -457,12 +447,10 @@ int main(int argc, char *argv[]) {
pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
pthread_rwlock_t vector_lock, vector_swap_lock;
pthread_rwlock_init(&vector_lock, &attr);
pthread_rwlock_init(&vector_swap_lock, &attr);
#pragma omp parallel default(none) \
shared(stderr, args, a_out, b_out, a_in, b_in, row_len, matrix, row_count, \
bin_size, bin_count, unique, ends, unique_swap, ends_swap, \
vector_lock, vector_swap_lock)
bin_size, bin_count, unique, ends, vector_lock)
{
_Bool *seen = malloc(bin_size * sizeof(_Bool));
if (seen == NULL) {
@ -509,7 +497,10 @@ int main(int argc, char *argv[]) {
// Base round
#pragma omp for
size_t *local_unique = NULL;
size_t *local_ends = NULL;
#pragma omp for schedule(dynamic)
for (size_t bin_i = 0; bin_i < bin_count; bin_i++) {
const size_t block_i_start = bin_i * bin_size;
size_t bin_i_size = row_count - block_i_start;
@ -521,64 +512,67 @@ int main(int argc, char *argv[]) {
cg_buf, &a_in_perm, &b_in_perm, a_out_perms, b_out_perms,
block_i_start, bin_i_size);
pthread_rwlock_wrlock(&vector_lock);
{
size_t unique_count = 0;
for (size_t i = 0; i < bin_i_size; i++) {
if (!seen[i]) {
stbds_arrput(unique, block_i_start + i);
unique_count++;
}
}
if (stbds_arrlenu(ends) == 0) {
stbds_arrput(ends, unique_count);
} else {
size_t last_end = stbds_arrlast(ends);
stbds_arrput(ends, last_end + unique_count);
size_t unique_count = 0;
for (size_t i = 0; i < bin_i_size; i++) {
if (!seen[i]) {
stbds_arrput(local_unique, block_i_start + i);
unique_count++;
}
}
pthread_rwlock_unlock(&vector_lock);
size_t ends_len = stbds_arrlenu(local_ends);
if (ends_len == 0) {
stbds_arrput(local_ends, unique_count);
} else {
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + unique_count);
}
fprintf(stderr, ".");
fflush(stderr);
} // End of omp for
// Implicit barrier
/*
// Merge locals
pthread_rwlock_wrlock(&vector_lock);
{
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
stbds_arrpush(unique, local_unique[i]);
}
size_t ends_base_value = 0;
if (stbds_arrlen(ends) > 0) {
ends_base_value = stbds_arrlast(ends);
}
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
stbds_arrpush(ends, local_ends[i] + ends_base_value);
}
}
pthread_rwlock_unlock(&vector_lock);
#pragma omp barrier
stbds_arrfree(local_unique);
stbds_arrfree(local_ends);
local_unique = NULL;
local_ends = NULL;
#pragma omp single
{
printf("Base round results:\n");
size_t count = stbds_arrlen(ends);
for (size_t i = 0; i < count; i++) {
size_t *block;
size_t len;
if (i == 0) {
len = ends[0];
block = unique;
} else {
len = ends[i] - ends[i - 1];
block = unique + ends[i - 1];
}
printf("%zu[", ends[i]);
for (size_t j = 0; j < len; j++) {
printf("%zu, ", block[j]);
}
printf("]\n");
}
printf("\nReduction rounds:\n");
}*/
fprintf(stderr, "Base round complete\n");
fflush(stderr);
}
// Reductions round
size_t round_bin_count;
{
pthread_rwlock_rdlock(&vector_lock);
round_bin_count = stbds_arrlenu(ends);
pthread_rwlock_unlock(&vector_lock);
}
pthread_rwlock_rdlock(&vector_lock);
{ round_bin_count = stbds_arrlenu(ends); }
pthread_rwlock_unlock(&vector_lock);
while (round_bin_count > 1) {
const size_t pairs = round_bin_count / 2;
#pragma omp for
#pragma omp for schedule(dynamic)
for (size_t pair_i = 0; pair_i < pairs; pair_i++) {
size_t *first_block_idxs, *second_block_idxs;
size_t first_block_len, second_block_len;
@ -598,81 +592,76 @@ int main(int argc, char *argv[]) {
b_out_perms, first_block_idxs, first_block_len,
second_block_idxs, second_block_len);
pthread_rwlock_wrlock(&vector_swap_lock);
{
size_t reduction_size = first_block_len;
for (size_t i = 0; i < first_block_len; i++) {
stbds_arrput(unique_swap, first_block_idxs[i]);
stbds_arrput(local_unique, first_block_idxs[i]);
}
for (size_t i = 0; i < second_block_len; i++) {
if (!seen[i]) {
stbds_arrput(unique_swap, second_block_idxs[i]);
stbds_arrput(local_unique, second_block_idxs[i]);
reduction_size++;
}
}
if (stbds_arrlenu(ends_swap) == 0) {
stbds_arrput(ends_swap, reduction_size);
if (stbds_arrlenu(local_ends) == 0) {
stbds_arrput(local_ends, reduction_size);
} else {
size_t last_end = stbds_arrlast(ends_swap);
stbds_arrput(ends_swap, last_end + reduction_size);
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + reduction_size);
}
}
pthread_rwlock_unlock(&vector_swap_lock);
} // End of loop over pairs
// Implicit barrier
// This barrier is important, as it ensures that `unique` and `ends` are
// read-only and can be accessed without locking.
// For the same reason, and in conjunction with the following #single
// section, there is no need to lock anything in the following #single
// section
// Implicit barrier. Important because of the following #single
#pragma omp single
{
if (IsOdd(round_bin_count)) {
// Push the remaining bin into the swap variables
// Note that stbds_arrlenu(ends) is guaranteed to be >= 2 at this
// point
size_t *odd_block_idxs = unique + ends[stbds_arrlenu(ends) - 2];
size_t odd_block_len =
stbds_arrlast(ends) - ends[stbds_arrlenu(ends) - 2];
for (size_t i = 0; i < odd_block_len; i++) {
stbds_arrput(unique_swap, odd_block_idxs[i]);
stbds_arrput(local_unique, odd_block_idxs[i]);
}
size_t last_end = stbds_arrlast(ends_swap);
stbds_arrput(ends_swap, last_end + odd_block_len);
}
/*
size_t count = stbds_arrlen(ends_swap);
for (size_t i = 0; i < count; i++) {
size_t *block;
size_t len;
if (i == 0) {
len = ends_swap[0];
block = unique_swap;
if (stbds_arrlenu(local_ends) > 0) {
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + odd_block_len);
} else {
len = ends_swap[i] - ends_swap[i - 1];
block = unique_swap + ends_swap[i - 1];
stbds_arrput(local_ends, odd_block_len);
}
printf("%zu[", ends_swap[i]);
for (size_t j = 0; j < len; j++) {
printf("%zu, ", block[j]);
}
printf("] ");
}
printf("\n");
*/
stbds_arrfree(unique);
stbds_arrfree(ends);
unique = unique_swap;
ends = ends_swap;
unique_swap = NULL;
ends_swap = NULL;
unique = NULL;
ends = NULL;
} // End of single section
// Implicit barrier
// Implicit barrier. Important due to the clearing of unique, local.
// Merge locals into global
pthread_rwlock_wrlock(&vector_lock);
{
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
stbds_arrpush(unique, local_unique[i]);
}
size_t ends_base_value = 0;
if (stbds_arrlen(ends) > 0) {
ends_base_value = stbds_arrlast(ends);
}
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
stbds_arrpush(ends, local_ends[i] + ends_base_value);
}
}
pthread_rwlock_unlock(&vector_lock);
stbds_arrfree(local_unique);
stbds_arrfree(local_ends);
local_unique = NULL;
local_ends = NULL;
#pragma omp barrier
pthread_rwlock_rdlock(&vector_lock);
{ round_bin_count = stbds_arrlenu(ends); }