Less blocking

2023-06-05 23:51:56 +01:00 · 2023-06-05 23:51:56 +01:00 · 3a79b7072e
parent 1209fd27ec
commit 3a79b7072e
1 changed file with 95 additions and 106 deletions
--- a/src/main.c
+++ b/src/main.c
@@ -210,14 +210,10 @@ static void UniqueInBlock(const size_t a_in, const size_t b_in,
            ResetConditionalPermutations(b_out_perms, b_in);
            while (!b_out_perms[b_in - 1].exhausted) {
              // Compare the two rows
-              // TODO: Figure out why both A==B and B==A need to be checked.
-              // Of note is that the pathological case I found occurs under a
-              //  party swap.
-              // See also: the same problem in UniqueInSubsetPair

              FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
-                  a_in_perm->permutation, b_in_perm->permutation,
-                  a_out_perms, b_out_perms);
+                        a_in_perm->permutation, b_in_perm->permutation,
+                        a_out_perms, b_out_perms);

              {
                FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
@@ -348,14 +344,10 @@ static void UniqueInSubsetPair(
          while (!a_out_perms[a_in - 1].exhausted) {
            ResetConditionalPermutations(b_out_perms, b_in);
            while (!b_out_perms[b_in - 1].exhausted) {
-              // TODO: Figure out why both A==B and B==A need to be checked.
-              // Of note is that the pathological case I found occurs under a
-              //  party swap.
-
              // Compare the two rows
              FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
-                  a_in_perm->permutation, b_in_perm->permutation,
-                  a_out_perms, b_out_perms);
+                        a_in_perm->permutation, b_in_perm->permutation,
+                        a_out_perms, b_out_perms);

              {
                FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
@@ -448,8 +440,6 @@ int main(int argc, char *argv[]) {
  size_t *unique = NULL;
  size_t *ends =
      NULL;  // Bin i contains elements with indices ends[i-1]..ends[i]
-  size_t *unique_swap = NULL;
-  size_t *ends_swap = NULL;

  // TODO: Optimization: only one lock is needed
  pthread_rwlockattr_t attr;
@@ -457,12 +447,10 @@ int main(int argc, char *argv[]) {
  pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  pthread_rwlock_t vector_lock, vector_swap_lock;
  pthread_rwlock_init(&vector_lock, &attr);
-  pthread_rwlock_init(&vector_swap_lock, &attr);

 #pragma omp parallel default(none)                                             \
    shared(stderr, args, a_out, b_out, a_in, b_in, row_len, matrix, row_count, \
-               bin_size, bin_count, unique, ends, unique_swap, ends_swap,      \
-               vector_lock, vector_swap_lock)
+               bin_size, bin_count, unique, ends, vector_lock)
  {
    _Bool *seen = malloc(bin_size * sizeof(_Bool));
    if (seen == NULL) {
@@ -509,7 +497,10 @@ int main(int argc, char *argv[]) {

    // Base round

-#pragma omp for
+    size_t *local_unique = NULL;
+    size_t *local_ends = NULL;
+
+#pragma omp for schedule(dynamic)
    for (size_t bin_i = 0; bin_i < bin_count; bin_i++) {
      const size_t block_i_start = bin_i * bin_size;
      size_t bin_i_size = row_count - block_i_start;
@@ -521,64 +512,67 @@ int main(int argc, char *argv[]) {
                    cg_buf, &a_in_perm, &b_in_perm, a_out_perms, b_out_perms,
                    block_i_start, bin_i_size);

-      pthread_rwlock_wrlock(&vector_lock);
-      {
-        size_t unique_count = 0;
-        for (size_t i = 0; i < bin_i_size; i++) {
-          if (!seen[i]) {
-            stbds_arrput(unique, block_i_start + i);
-            unique_count++;
-          }
-        }
-        if (stbds_arrlenu(ends) == 0) {
-          stbds_arrput(ends, unique_count);
-        } else {
-          size_t last_end = stbds_arrlast(ends);
-          stbds_arrput(ends, last_end + unique_count);
+      size_t unique_count = 0;
+      for (size_t i = 0; i < bin_i_size; i++) {
+        if (!seen[i]) {
+          stbds_arrput(local_unique, block_i_start + i);
+          unique_count++;
        }
      }
-      pthread_rwlock_unlock(&vector_lock);
+      size_t ends_len = stbds_arrlenu(local_ends);
+      if (ends_len == 0) {
+        stbds_arrput(local_ends, unique_count);
+      } else {
+        size_t last_end = stbds_arrlast(local_ends);
+        stbds_arrput(local_ends, last_end + unique_count);
+      }
+
+      fprintf(stderr, ".");
+      fflush(stderr);

    }  // End of omp for
       // Implicit barrier

-    /*
+    // Merge locals
+    pthread_rwlock_wrlock(&vector_lock);
+    {
+      for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
+        stbds_arrpush(unique, local_unique[i]);
+      }
+      size_t ends_base_value = 0;
+      if (stbds_arrlen(ends) > 0) {
+        ends_base_value = stbds_arrlast(ends);
+      }
+      for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
+        stbds_arrpush(ends, local_ends[i] + ends_base_value);
+      }
+    }
+    pthread_rwlock_unlock(&vector_lock);
+
+#pragma omp barrier
+
+    stbds_arrfree(local_unique);
+    stbds_arrfree(local_ends);
+    local_unique = NULL;
+    local_ends = NULL;
+
 #pragma omp single
    {
-      printf("Base round results:\n");
-      size_t count = stbds_arrlen(ends);
-      for (size_t i = 0; i < count; i++) {
-        size_t *block;
-        size_t len;
-        if (i == 0) {
-          len = ends[0];
-          block = unique;
-        } else {
-          len = ends[i] - ends[i - 1];
-          block = unique + ends[i - 1];
-        }
-
-        printf("%zu[", ends[i]);
-        for (size_t j = 0; j < len; j++) {
-          printf("%zu, ", block[j]);
-        }
-        printf("]\n");
-      }
-      printf("\nReduction rounds:\n");
-    }*/
+      fprintf(stderr, "Base round complete\n");
+      fflush(stderr);
+    }

    // Reductions round

    size_t round_bin_count;
-    {
-      pthread_rwlock_rdlock(&vector_lock);
-      round_bin_count = stbds_arrlenu(ends);
-      pthread_rwlock_unlock(&vector_lock);
-    }
+    pthread_rwlock_rdlock(&vector_lock);
+    { round_bin_count = stbds_arrlenu(ends); }
+    pthread_rwlock_unlock(&vector_lock);
+
    while (round_bin_count > 1) {
      const size_t pairs = round_bin_count / 2;

-#pragma omp for
+#pragma omp for schedule(dynamic)
      for (size_t pair_i = 0; pair_i < pairs; pair_i++) {
        size_t *first_block_idxs, *second_block_idxs;
        size_t first_block_len, second_block_len;
@@ -598,81 +592,76 @@ int main(int argc, char *argv[]) {
                           b_out_perms, first_block_idxs, first_block_len,
                           second_block_idxs, second_block_len);

-        pthread_rwlock_wrlock(&vector_swap_lock);
        {
          size_t reduction_size = first_block_len;
          for (size_t i = 0; i < first_block_len; i++) {
-            stbds_arrput(unique_swap, first_block_idxs[i]);
+            stbds_arrput(local_unique, first_block_idxs[i]);
          }
          for (size_t i = 0; i < second_block_len; i++) {
            if (!seen[i]) {
-              stbds_arrput(unique_swap, second_block_idxs[i]);
+              stbds_arrput(local_unique, second_block_idxs[i]);
              reduction_size++;
            }
          }
-          if (stbds_arrlenu(ends_swap) == 0) {
-            stbds_arrput(ends_swap, reduction_size);
+          if (stbds_arrlenu(local_ends) == 0) {
+            stbds_arrput(local_ends, reduction_size);
          } else {
-            size_t last_end = stbds_arrlast(ends_swap);
-            stbds_arrput(ends_swap, last_end + reduction_size);
+            size_t last_end = stbds_arrlast(local_ends);
+            stbds_arrput(local_ends, last_end + reduction_size);
          }
        }
-        pthread_rwlock_unlock(&vector_swap_lock);

      }  // End of loop over pairs
-         // Implicit barrier
-         // This barrier is important, as it ensures `unique` and `ends` to be
-         // read only without locking.
-         // For the same reason, and in conjunction over the following #single
-         // section, there is no need to lock anything in the following #single
-         // section
+         // Implicit barrier. Important because of the following #single
+
 #pragma omp single
      {
        if (IsOdd(round_bin_count)) {
          // Push the remaining bin into the swap variables
-          // Note that stbds_arrlenu(ends)) is guaranteed to be >= 2 at this
-          // point
          size_t *odd_block_idxs = unique + ends[stbds_arrlenu(ends) - 2];
          size_t odd_block_len =
              stbds_arrlast(ends) - ends[stbds_arrlenu(ends) - 2];

          for (size_t i = 0; i < odd_block_len; i++) {
-            stbds_arrput(unique_swap, odd_block_idxs[i]);
+            stbds_arrput(local_unique, odd_block_idxs[i]);
          }
-          size_t last_end = stbds_arrlast(ends_swap);
-          stbds_arrput(ends_swap, last_end + odd_block_len);
-        }
-
-        /*
-        size_t count = stbds_arrlen(ends_swap);
-        for (size_t i = 0; i < count; i++) {
-          size_t *block;
-          size_t len;
-          if (i == 0) {
-            len = ends_swap[0];
-            block = unique_swap;
+          if (stbds_arrlenu(local_ends) > 0) {
+            size_t last_end = stbds_arrlast(local_ends);
+            stbds_arrput(local_ends, last_end + odd_block_len);
          } else {
-            len = ends_swap[i] - ends_swap[i - 1];
-            block = unique_swap + ends_swap[i - 1];
+            stbds_arrput(local_ends, odd_block_len);
          }
-
-          printf("%zu[", ends_swap[i]);
-          for (size_t j = 0; j < len; j++) {
-            printf("%zu, ", block[j]);
-          }
-          printf("] ");
        }
-        printf("\n");
-        */
-
+        
        stbds_arrfree(unique);
        stbds_arrfree(ends);
-        unique = unique_swap;
-        ends = ends_swap;
-        unique_swap = NULL;
-        ends_swap = NULL;
+        unique = NULL;
+        ends = NULL;
      }  // End of single section
-         // Implicit barrier
+         // Implicit barrier. Important due to the clearing of unique and ends.
+
+      // Merge locals into global
+      pthread_rwlock_wrlock(&vector_lock);
+      {
+        for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
+          stbds_arrpush(unique, local_unique[i]);
+        }
+        size_t ends_base_value = 0;
+        if (stbds_arrlen(ends) > 0) {
+          ends_base_value = stbds_arrlast(ends);
+        }
+        for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
+          stbds_arrpush(ends, local_ends[i] + ends_base_value);
+        }
+      }
+      pthread_rwlock_unlock(&vector_lock);
+
+      stbds_arrfree(local_unique);
+      stbds_arrfree(local_ends);
+      local_unique = NULL;
+      local_ends = NULL;
+
+#pragma omp barrier

      pthread_rwlock_rdlock(&vector_lock);
      { round_bin_count = stbds_arrlenu(ends); }