Aux. script to test for compute-bound

Updated README after changes on this branch
No LFS! It breaks :)
2023-06-16 15:52:37 +01:00 · 2023-06-06 18:36:19 +01:00 · 2023-06-06 16:06:33 +01:00 · 2023-06-06 16:04:30 +01:00 · 2023-06-06 16:03:09 +01:00 · 2023-06-05 23:51:56 +01:00
9 changed files with 685 additions and 230 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,2 +0,0 @@
-data/3333_inq.txt filter=lfs diff=lfs merge=lfs -text
-splits/*.txt filter=lfs diff=lfs merge=lfs -text
--- a/69
+++ b/69
@ -22,7 +22,12 @@ scripts. The main executable can be compiled by running the corresponding

    python3 make.py link

-This will produce a main.exe executable in the root directory.
+This will produce a DEBUG [main.debug.exe] executable in the root directory.
+If a RELEASE environment variable is set, a RELEASE [main.release.exe]
+executable is produced instead, i.e., to produce a release executable, run:
+
+    RELEASE=1 python3 make.py link
+
 See all available recipes by running

    python3 make.py --list
@ -34,6 +39,64 @@ and read more about [sane]: at https://github.com/mikeevmm/sane.

 To use, pipe the input (structured according to the source specification) into
 the executable, while specifying the number of outputs and inputs, in that
-order. E.g.,
+order, followed by a bin size (see the following section for details on the
+meaning of the bin size). E.g.,

-    ./main.exe 2 2 2 2 < data/2222_inq.txt
+    ./main.release.exe 2 2 2 2 48 < data/2222_inq.txt
+    
+
+[[Notes on the Approach Taken]]
+
+Originally, pairs were straightforwardly compared. This was too slow to tackle
+even, for example [data/3332_inq.txt]. (Cf. [data/README].) So, instead, I'm now
+trying a divide-and-conquer approach, now explained.
+
+Consider the input to be N lines. From the ground truths, we know that the
+number of non-equivalent rows, which we will denote k, is such that k << N.
+So consider a subset of L lines (k << L < N). Without loss of generality, take
+N/L to be an integer. For each subset L, at most k lines are non-equivalent.
+We consider a "reduction" within each block, and then comparisons across
+blocks: if there are P permutations to be considered (see [doc/proposal.pdf]
+for details) this means, worst-case scenario, less than
+
+    (N/L)L²P + (N/2L)k²P + (N/4L)k²P + ... + (N/L)/2^log(N/L) k²P 
+  = (N/L)L²P + (N/L)k²P(1 - L/N)
+  = NP[L + k²/L (1 - L/N)]                                              (1)
+
+operations. Compare this with the "straightforward" mode of operation, where
+
+    N²P                                                                 (2)
+
+operations are required. Clearly, the divide-and-conquer approach has an
+advantage as long as the term in square brackets in (1) is significantly smaller
+than N.
+
+In pseudo-code:
+
+    block_size is provided
+    unique := []
+    sizes := []
+
+    base round:
+        # distribute loop
+        for block of rows of size block_size:
+            local_unique := unique rows in local block
+            push local_unique to unique
+            push len(local_unique) to sizes
+
+    reductions:
+        while len(sizes) > 1:
+            unique' := []
+            sizes' := []
+
+            # distribute loop
+            for pair of blocks in (unique, sizes): # Odd ones out are ignored
+                local := concatenation of pair of blocks
+                local_unique = unique rows in local
+                push local_unique to unique'
+                push len(local_unique) to sizes'
+
+            unique := unique'
+            sizes := sizes'
+            
+    done
--- a/aux/computebound.sh
+++ b/aux/computebound.sh
@ -0,0 +1,20 @@
+HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+OUTDIR=$(realpath "$HERE/../data/computebound/")
+EXE=$(realpath "$HERE/../main.release.exe")
+ARGS=(3 3 3 2 64)
+CORES=(1 2 3 4 5 6 7 8 9 10 11 12)
+CHUNKS=(0 1 2 3 4)
+for core in ${CORES[@]}
+do
+    #for chunk in ${CHUNKS[@]}
+    #do
+        #INFILE=$(realpath "$HERE/../data/splits/${chunk}_3233_inq.txt")
+        #OUTFILE="$OUTDIR/core_${core}_"$(basename "$INFILE.txt")
+        #echo "$EXE ${ARGS[@]} < \"$INFILE\"" | tee --append "$OUTFILE"
+        #OMP_THREAD_NUM=$core time --portability --append --output="$OUTFILE" "$EXE" "${ARGS[@]}" < "$INFILE" >> "$OUTFILE"
+    #done
+    INFILE=$(realpath "$HERE/../data/3233_inq.txt")
+    OUTFILE="$OUTDIR/core_${core}_$(basename "$INFILE.txt")"
+    echo "$EXE ${ARGS[@]} < \"$INFILE\"" | tee --append "$OUTFILE"
+    OMP_THREAD_NUM=$core time --portability --append --output="$OUTFILE" "$EXE" "${ARGS[@]}" < "$INFILE" >> "$OUTFILE"
+done
--- a/aux/split_data.py
+++ b/aux/split_data.py
@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""Splits an input file into random chunks.
+
+Usage:
+    split_data <input file> <chunks>
+
+See [root]/data/splits/README for a rationale.
+"""
+
+import os
+import argparse
+
+
+CHUNK_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/splits"))
+
+
+def get_args():
+    doclines = __doc__.splitlines()
+    description = doclines[0]
+    epilog = "\n".join(doclines[1:])
+
+    parser = argparse.ArgumentParser(description=description, epilog=epilog)
+    parser.add_argument("<input file>", type=str, help="Input file to split.")
+    parser.add_argument("<chunks>", type=int, help="Number of chunks to produce.")
+    args = vars(parser.parse_args())
+    return args
+
+
+def chunk_fpath(source_path, chunk_i, chunk_total):
+    in_basename = os.path.basename(source_path)
+    out_basename = f"{chunk_total}_{chunk_i}_{in_basename}"
+    return os.path.join(CHUNK_DIR, out_basename)
+
+
+def main():
+    args = get_args()
+    source_path = args["<input file>"]
+    chunk_count = args["<chunks>"]
+
+    with open(source_path, "r") as source_file:
+        line_count = 0
+        for _line in source_file:
+            line_count += 1
+
+        chunk_len = line_count // chunk_count
+
+        source_file.seek(0)
+
+        for chunk_i in range(chunk_count - 1):
+            outfile_name = chunk_fpath(source_path, chunk_i, chunk_count)
+            print(f'Writing {outfile_name}...')
+            with open(outfile_name, "w") as out_file:
+                chunk_i_len = 0
+                for line in source_file:
+                    out_file.write(line)
+
+                    chunk_i_len += 1
+                    if chunk_i_len >= chunk_len:
+                        break
+
+        if line_count % chunk_len != 0:
+            outfile_name = chunk_fpath(source_path, chunk_count - 1, chunk_count)
+            print(f'Writing {outfile_name}...')
+            with open(outfile_name, "w") as out_file:
+                for line in source_file:
+                    out_file.write(line)
+
+
+if __name__ == "__main__":
+    main()
--- a/data/.gitignore
+++ b/data/.gitignore
@ -0,0 +1,2 @@
+splits/
+!splits/.gitinclude
--- a/includes/matrix.h
+++ b/includes/matrix.h
@ -19,7 +19,7 @@ typedef struct {
 void DebugPrintMatrix(matrix_t *matrix);

 // Prints a row of a `matrix_t` to standard output.
-void PrintMatrixRow(matrix_t *matrix, size_t row);
+void PrintMatrixRow(const matrix_t *matrix, const size_t row);

 // Parse a matrix from the standard input.
 //
--- a/make.py
+++ b/make.py
@ -7,7 +7,7 @@ from sane import *

 COMPILER = "gcc"
 COMPILE_FLAGS = [
-    "-std=c11",
+    "-std=gnu11",
    "-g3",
    "-O0",
    "-Wall",
@ -16,9 +16,10 @@ COMPILE_FLAGS = [
    "-Wpedantic",
    "-pedantic-errors",
    "-fopenmp",
+    "-pthread",
 ]
-LINK_FLAGS = ["-fopenmp"]
-LIBRARIES = ["-lm"]
+LINK_FLAGS = ["-fopenmp", "-pthread"]
+LIBRARIES = ["-lm", "-lpthread"]
 OBJ_DIR = os.path.join("obj", "release" if ("RELEASE" in os.environ) else "debug")
 SRC_DIR = "src"
 EXE_NAME = "main." + ("release" if ("RELEASE" in os.environ) else "debug") + ".exe"
@ -26,7 +27,7 @@ EXE_NAME = "main." + ("release" if ("RELEASE" in os.environ) else "debug") + ".e
 ROOT = os.path.dirname(os.path.realpath(__file__))

 if "RELEASE" in os.environ:
-    COMPILE_FLAGS[COMPILE_FLAGS.index("-O0")] = "-O2"
+    COMPILE_FLAGS[COMPILE_FLAGS.index("-O0")] = "-O3"
    COMPILE_FLAGS[COMPILE_FLAGS.index("-g3")] = "-g"
    COMPILE_FLAGS.append("-DRELEASE")
    LINK_FLAGS.append("-DRELEASE")
--- a/src/main.c
+++ b/src/main.c
@ -1,5 +1,7 @@
 #include <alloca.h>
 #include <math.h>
+#include <omp.h>
+#include <pthread.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -7,71 +9,15 @@
 #include <time.h>
 #include <unistd.h>

+// #include "../contrib/orlp/ipow.h"
 #define STB_DS_IMPLEMENTATION
 #define STBDS_NO_SHORT_NAMES
-#include "../contrib/orlp/ipow.h"
 #include "../contrib/stb/stb_ds.h"
 #include "../includes/cg.h"
 #include "../includes/matrix.h"
 #include "../includes/permutation.h"

-static uintmax_t _timing_last_count = 0;
-static clock_t _timing_last_time = 0.;
-static float _timing_rate = 0.;
-static float _timing_exp_backoff = 1.;
-
-static void ReportProgress(uintmax_t current, uintmax_t total, clock_t now) {
-  _Bool updated = 0;
-#pragma omp critical
-  {
-    if (current >= _timing_last_count) {
-      float inferred_rate = (float)(current - _timing_last_count) /
-                            ((float)(now - _timing_last_time) / CLOCKS_PER_SEC);
-      if (inferred_rate < _timing_rate) {
-        _timing_rate = inferred_rate;
-        _timing_exp_backoff = 0.3;
-      } else {
-        _timing_rate = _timing_rate * (1. - _timing_exp_backoff) +
-                       inferred_rate * _timing_exp_backoff;
-        _timing_exp_backoff = fmax(0.99 * _timing_exp_backoff, 0.001);
-      }
-      _timing_last_count = current;
-      _timing_last_time = now;
-      updated = 1;
-    }
-  }
-
-  if (updated) {
-    uintmax_t time_left = (total - current) / _timing_rate;
-
-    uintmax_t days = time_left / 60 / 60 / 24;
-    if (days > 0) {
-      fprintf(stderr,
-              "\33[2K\r%zu/%zu (%" PRIuMAX " iter/sec | About %" PRIuMAX
-              " days left.)",
-              current, total, (uintmax_t)_timing_rate, days);
-    } else {
-      uint8_t hours = (time_left % (60 * 60 * 24)) / 60;
-      uint8_t minutes = (time_left % (60 * 60)) / 60;
-      uint8_t seconds = time_left % 60;
-
-      fprintf(stderr,
-              "\33[2K\r%zu/%zu (%" PRIuMAX " iter/sec | About %" PRIu8
-              "h%" PRIu8 "m%" PRIu8 "s left.)",
-              current, total, (uintmax_t)_timing_rate, hours, minutes, seconds);
-    }
-
-    fflush(stderr);
-  }
-}
-
-static uintmax_t Factorial(uint8_t value) {
-  uintmax_t acc = 1;
-  for (uint8_t i = 2; i <= value; i++) {
-    acc *= (uintmax_t)value;
-  }
-  return acc;
-}
+static inline _Bool IsOdd(size_t value) { return (_Bool)(value & 1); }

 static _Bool ReadArguments(char *from, size_t *into, const char *varname) {
  if (!sscanf(from, "%zu", into)) {
@ -82,19 +28,21 @@ static _Bool ReadArguments(char *from, size_t *into, const char *varname) {
 }

 static inline void AdvanceConditionalPermutations(
-    permutation_generator_t *generators, size_t count) {
-  PermutationNext(generators);
+    permutation_generator_t *generators, const size_t count) {
+  PermutationNext(generators + 0);
  for (size_t i = 0; i < count - 1; i++) {
    if (generators[i].exhausted) {
      PermutationReset(generators + i);
      PermutationNext(generators + i + 1);
+    } else {
+      break;
    }
  }
  // If generators[-1] is exhausted, the conditional permutations are exhausted.
 }

 static inline void ResetConditionalPermutations(
-    permutation_generator_t *generators, size_t count) {
+    permutation_generator_t *generators, const size_t count) {
  for (size_t i = 0; i < count; i++) {
    PermutationReset(generators + i);
  }
@ -112,6 +60,7 @@ typedef struct {
  size_t b_out;
  size_t a_in;
  size_t b_in;
+  size_t bin_size;
  arg_format_t formatting;
 } args_t;

@ -129,17 +78,18 @@ static args_t ParseArgs(size_t argc, char *argv[]) {
    }
  }

-  size_t a_out, b_out, a_in, b_in;
+  size_t a_out, b_out, a_in, b_in, bin_size;
  arg_format_t formatting;

-  if (stbds_arrlen(positionals) != 4) {
+  if (stbds_arrlen(positionals) != 5) {
    goto arg_error;
  }

  if (!ReadArguments(positionals[0], &a_out, "A outputs") ||
      !ReadArguments(positionals[1], &b_out, "B outputs") ||
      !ReadArguments(positionals[2], &a_in, "A inputs") ||
-      !ReadArguments(positionals[3], &b_in, "B inputs")) {
+      !ReadArguments(positionals[3], &b_in, "B inputs") ||
+      !ReadArguments(positionals[4], &bin_size, "bin size")) {
    goto arg_error;
  }

@ -160,6 +110,7 @@ static args_t ParseArgs(size_t argc, char *argv[]) {
      .a_out = a_out,
      .b_in = b_in,
      .b_out = b_out,
+      .bin_size = bin_size,
      .formatting = formatting,
  };

@ -176,20 +127,306 @@ arg_error:
  if (argc > 0) {
    fprintf(stderr, "%s", argv[0]);
  } else {
-    fprintf(stderr, "./exe");
+    fprintf(stderr, "./main.exe");
  }
  fprintf(stderr,
-          " [--cg|--p|--p-raw] <A outputs> <B outputs> <A inputs> <B inputs>");
+          " [--cg|--p|--p-raw] <A outputs> <B outputs> <A inputs> <B inputs> "
+          "<bin size>");
  exit(EXIT_FAILURE);
 }

+static _Bool RowsEquivalent(
+    const size_t a_in, const size_t b_in, const size_t a_out,
+    const size_t b_out, size_t row_len, data_t *p_buf, data_t *cg_buf,
+    permutation_generator_t *a_in_perm, permutation_generator_t *b_in_perm,
+    permutation_generator_t *a_out_perms, permutation_generator_t *b_out_perms,
+    const data_t *restrict lhs, const data_t *restrict rhs) {
+  PermutationReset(a_in_perm);
+  PermutationReset(b_in_perm);
+  for (size_t i = 0; i < a_in; i++) {
+    PermutationReset(a_out_perms + i);
+  }
+  for (size_t i = 0; i < b_in; i++) {
+    PermutationReset(b_out_perms + i);
+  }
+
+  while (!a_in_perm->exhausted) {
+    PermutationReset(b_in_perm);
+    while (!b_in_perm->exhausted) {
+      ResetConditionalPermutations(a_out_perms, a_in);
+      while (!a_out_perms[a_in - 1].exhausted) {
+        ResetConditionalPermutations(b_out_perms, b_in);
+        while (!b_out_perms[b_in - 1].exhausted) {
+          // Compare the two rows
+
+          FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
+                    a_in_perm->permutation, b_in_perm->permutation, a_out_perms,
+                    b_out_perms);
+
+          {
+            FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
+            _Bool equivalent = 1;
+            for (size_t i = row_len; i > 0; i--) {
+              if (lhs[i - 1] != cg_buf[i - 1]) {
+                equivalent = 0;
+                break;
+              }
+            }
+
+            if (equivalent) {
+              return 1;
+            }
+          }
+
+          // If the number of output labels are the same, and the number
+          // of input labels is also the same, we can also check for
+          // equality under party swapping. I don't expect this
+          // conditional to be very penalizing because it's very
+          // predictable.
+          if (a_in == b_in && a_out == b_out) {
+            // Use the results in p_buf
+            PSwapParties(a_out, a_in, p_buf);
+            FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
+            _Bool equivalent = 1;
+            for (size_t i = row_len; i > 0; i--) {
+              if (lhs[i - 1] != cg_buf[i - 1]) {
+                equivalent = 0;
+                break;
+              }
+            }
+
+            if (equivalent) {
+              return 1;
+            }
+          }
+          AdvanceConditionalPermutations(b_out_perms, b_in);
+        }
+
+        AdvanceConditionalPermutations(a_out_perms, a_in);
+      }
+
+      PermutationNext(b_in_perm);
+    }
+
+    PermutationNext(a_in_perm);
+  }
+  
+  return 0;
+}
+
+typedef struct {
+  // The updated pointer to the vector with the indices of the unique rows.
+  size_t *unique;
+  // The number of elements pushed to the vector.
+  size_t count;
+} reduction_result_t;
+
+// Calculates inequivalent rows out of a given block of the rows. A block is a
+// contiguous subset of rows, with indices
+// `block_start..(block_start+row_count)`.
+//
+// Arguments:
+//    a_in        Number of inputs for A
+//    b_in        Number of inputs for B
+//    a_out       Number of outputs for A
+//    a_out       Number of outputs for B
+//    matrix      The matrix whose rows are to be examined
+//    row_len     Length of a matrix row
+//    p_buf       data_t buffer capable of holding a P representation row
+//    cg_buf      data_t buffer capable of holding a CG representation row
+//    a_in_perm   permutation_generator_t for permutations of A's input
+//    b_in_perm   permutation_generator_t for permutations of B's input
+//    a_out_perms permutation_generator_t* array for permutations of A's output
+//    b_out_perms permutation_generator_t* array for permutations of B's output
+//    block_start Where to start counting the indices from.
+//    row_count   Length of block
+//    unique      size_t stbds vector pointer
+//
+// The vector `unique` is only pushed to; it's the user's responsibility to
+// allocate or free it as needed.
+//
+// `row_count` MUST be greater than 0, otherwise behaviour is undefined.
+//
+// `p_buf`, and `cg_buf` WILL be clobbered. They don't need to be initialized,
+// only allocated. It's the caller's responsibility to free these buffers if
+// applicable.
+//
+// The permutation generators are expected to be correctly initialized; this
+// function only resets the permutation generators, it does NOT allocate them
+// or free them.
+static reduction_result_t UniqueInBlock(
+    const size_t a_in, const size_t b_in, const size_t a_out,
+    const size_t b_out, const matrix_t *matrix, size_t row_len, data_t *p_buf,
+    data_t *cg_buf, permutation_generator_t *a_in_perm,
+    permutation_generator_t *b_in_perm, permutation_generator_t *a_out_perms,
+    permutation_generator_t *b_out_perms, const size_t block_start,
+    const size_t row_count, size_t *unique) {
+  size_t *remaining = NULL;
+  size_t *remaining_swap = NULL;
+  size_t push_count = 0;
+
+  // First round: perform the first split of equivalence classes.
+  {
+    stbds_arrput(unique, block_start);
+    push_count++;
+    data_t *root = matrix->head + block_start * matrix->row_len;
+
+    for (size_t rhs_i = 1; rhs_i < row_count; rhs_i++) {
+      data_t *rhs = matrix->head + (block_start + rhs_i) * matrix->row_len;
+      _Bool equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len,
+                                        p_buf, cg_buf, a_in_perm, b_in_perm,
+                                        a_out_perms, b_out_perms, root, rhs);
+      if (!equivalent) {
+        stbds_arrput(remaining, rhs_i);
+      }
+    }
+  }
+
+  // Remaining rounds: perform reduction
+  {
+    while (stbds_arrlenu(remaining) > 0) {
+      size_t lhs_i = stbds_arrpop(remaining);
+      stbds_arrput(unique, block_start + lhs_i);
+      push_count++;
+      data_t *lhs = matrix->head + (block_start + lhs_i) * matrix->row_len;
+
+      for (size_t i = 0; i < stbds_arrlenu(remaining); i++) {
+        size_t rhs_i = remaining[i];
+        data_t *rhs = matrix->head + (block_start + rhs_i) * matrix->row_len;
+
+        _Bool equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len,
+                                          p_buf, cg_buf, a_in_perm, b_in_perm,
+                                          a_out_perms, b_out_perms, lhs, rhs);
+        if (!equivalent) {
+          stbds_arrput(remaining_swap, rhs_i);
+        }
+      }
+
+      stbds_arrfree(remaining);
+      remaining = remaining_swap;
+      remaining_swap = NULL;
+    }
+  }
+
+  stbds_arrfree(remaining);
+  stbds_arrfree(remaining_swap);
+
+  reduction_result_t result = {
+      .unique = unique,
+      .count = push_count,
+  };
+
+  return result;
+}
+
+// Calculates inequivalent rows out of a given subset of the rows. The subset
+// is identified by a pair of arrays of indices.
+//
+// Arguments:
+//    a_in              Number of inputs for A
+//    b_in              Number of inputs for B
+//    a_out             Number of outputs for A
+//    a_out             Number of outputs for B
+//    matrix            The matrix whose rows are to be examined
+//    row_len           Length of a matrix row
+//    p_buf             data_t buffer capable of holding a P representation row
+//    cg_buf            data_t buffer capable of holding a CG representation row
+//    a_in_perm         permutation_generator_t for permutations of A's input
+//    b_in_perm         permutation_generator_t for permutations of B's input
+//    a_out_perms       permutation_generator_t* array for permutations of A's
+//                        output
+//    b_out_perms       permutation_generator_t* array for permutations of B's
+//                        output
+//    first_row_idxs    First array of indices of rows to consider
+//    first_row_count   Length of first_row_idxs[]
+//    second_row_idxs   Second array of indices of rows to consider
+//    second_row_count  Length of second_row_idxs[]
+//    unique            size_t stbds vector pointer
+//
+// The vector `unique` is only pushed to; it's the user's responsibility to
+// allocate or free it as needed.
+//
+// `first_row_count` and `second_row_count` MUST be greater than 0, otherwise
+// behaviour is undefined.
+//
+// `seen`, `p_buf`, and `cg_buf` WILL be clobbered. They don't need to be
+// initialized, only allocated. It's the caller's responsibility to free these
+// buffers if applicable.
+//
+// The permutation generators are expected to be correctly initialized; this
+// function only resets the permutation generators, it does NOT allocate them
+// or free them.
+static reduction_result_t UniqueInSubsetPair(
+    const size_t a_in, const size_t b_in, const size_t a_out,
+    const size_t b_out, const matrix_t *matrix, const size_t row_len,
+    data_t *p_buf, data_t *cg_buf, permutation_generator_t *a_in_perm,
+    permutation_generator_t *b_in_perm, permutation_generator_t *a_out_perms,
+    permutation_generator_t *b_out_perms, const size_t *first_row_idxs,
+    const size_t first_row_count, const size_t *second_row_idxs,
+    const size_t second_row_count, size_t *unique) {
+  // This one is a little more complicated:
+  // Every two elements within `first_row_idxs` are inequivalent, by design, and
+  //  likewise for `second_row_idxs`. This means that if a row A in `first_row`
+  //  is equivalent to a row A' in `second_row`, it cannot be equivalent to any
+  //  other row in `second_row` by transitivity of the equivalence.
+
+  size_t push_count = 0;
+  size_t *classes = NULL;
+
+  for (size_t i = 0; i < first_row_count; i++) {
+    size_t lhs_i = first_row_idxs[i];
+    stbds_arrput(unique, lhs_i);
+    push_count++;
+    stbds_arrput(classes, lhs_i);
+  }
+
+  for (size_t i = 0; i < second_row_count; i++) {
+    if (stbds_arrlenu(classes) == 0) {
+      break;
+    }
+
+    size_t rhs_i = second_row_idxs[i];
+    data_t *rhs = matrix->head + rhs_i * matrix->row_len;
+
+    _Bool equivalent;
+
+    for (size_t i = 0; i < stbds_arrlenu(classes); i++) {
+      size_t lhs_i = classes[i];
+      data_t *lhs = matrix->head + lhs_i * matrix->row_len;
+
+      equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len, p_buf,
+                                  cg_buf, a_in_perm, b_in_perm, a_out_perms,
+                                  b_out_perms, lhs, rhs);
+
+      if (equivalent) {
+        stbds_arrdel(classes, i);
+        break;
+      }
+    }
+
+    if (!equivalent) {
+      stbds_arrput(unique, rhs_i);
+      push_count++;
+    }
+  }
+
+  stbds_arrfree(classes);
+
+  reduction_result_t result = {
+      .unique = unique,
+      .count = push_count,
+  };
+
+  return result;
+}
+
 int main(int argc, char *argv[]) {
-  args_t args = ParseArgs(argc, argv);
-  size_t a_out, a_in, b_out, b_in;
-  a_out = args.a_out;
-  b_out = args.b_out;
-  a_in = args.a_in;
-  b_in = args.b_in;
+  const args_t args = ParseArgs(argc, argv);
+  const size_t a_out = args.a_out;
+  const size_t b_out = args.b_out;
+  const size_t a_in = args.a_in;
+  const size_t b_in = args.b_in;
+  const size_t bin_size = args.bin_size;

  if (a_out < 2 || b_out < 2 || a_in < 2 || b_in < 2) {
    fprintf(stderr,
@ -198,28 +435,36 @@ int main(int argc, char *argv[]) {
    exit(EXIT_FAILURE);
  }

-  size_t row_len = (a_out - 1) * (b_out - 1) * a_in * b_in +
-                   (a_out - 1) * a_in + (b_out - 1) * b_in + 1;
+  const size_t row_len = (a_out - 1) * (b_out - 1) * a_in * b_in +
+                         (a_out - 1) * a_in + (b_out - 1) * b_in + 1;
  matrix_t matrix = ParseMatrix(row_len);
-  size_t row_count = matrix.len / matrix.row_len;
+  const size_t row_count = matrix.len / matrix.row_len;

-  _Bool *seen = calloc(row_count, sizeof(_Bool));
-  if (seen == NULL) {
-    fprintf(stderr, "Failed to allocate equivalence flag string. Aborting.");
+  if (bin_size > row_count) {
+    fprintf(stderr,
+            "Bin size cannot be bigger than the number of rows in the input.");
    exit(EXIT_FAILURE);
  }

-  // Start testing for equivalences.
-  const uintmax_t perm_iterations =
-      Factorial(a_in) * Factorial(b_in) *
-      (uintmax_t)upow((uint64_t)(Factorial(a_in)), (uint8_t)a_out) *
-      (uintmax_t)upow((uint64_t)(Factorial(b_in)), (uint8_t)b_out);
-  const uintmax_t total =
-      (uintmax_t)row_count * (uintmax_t)row_count * perm_iterations;
-  uintmax_t tested = 0;
+  // BEGINNING OF COMPARISON ALGORITHM
+
+  const size_t bin_count =
+      ((row_count % bin_size == 0) ? row_count / bin_size
+                                   : row_count / bin_size + 1);
+
+  size_t *unique = NULL;
+  size_t *ends =
+      NULL;  // Bin i contains elements with indices ends[i-1]..ends[i]
+
+  pthread_rwlockattr_t attr;
+  pthread_rwlockattr_init(&attr);
+  pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+  pthread_rwlock_t vector_lock, vector_swap_lock;
+  pthread_rwlock_init(&vector_lock, &attr);
+
 #pragma omp parallel default(none)                                             \
    shared(stderr, args, a_out, b_out, a_in, b_in, row_len, matrix, row_count, \
-               seen, perm_iterations, total, tested)
+               bin_size, bin_count, unique, ends, vector_lock)
  {
    data_t *p_buf = malloc(a_out * b_out * a_in * b_in * sizeof(data_t));
    data_t *cg_buf = malloc((((a_out - 1) * (b_out - 1) * a_in * b_in +
@ -253,120 +498,172 @@ int main(int argc, char *argv[]) {

    for (size_t i = 0; i < a_in; i++) {
      a_out_perms[i] = PermutationNewGenerator(a_out);
-      PermutationReset(a_out_perms + i);
    }
    for (size_t i = 0; i < b_in; i++) {
      b_out_perms[i] = PermutationNewGenerator(b_out);
-      PermutationReset(b_out_perms + i);
    }

+    // Base round
+
+    size_t *local_unique = NULL;
+    size_t *local_ends = NULL;
+
 #pragma omp for schedule(dynamic)
-    for (size_t lhs_i = 0; lhs_i < row_count - 1; lhs_i++) {
+    for (size_t bin_i = 0; bin_i < bin_count; bin_i++) {
+      const size_t block_i_start = bin_i * bin_size;
+      size_t bin_i_size = row_count - block_i_start;
+      if (bin_i_size > bin_size) {
+        bin_i_size = bin_size;
+      }
+
+      reduction_result_t result =
+          UniqueInBlock(a_in, b_in, a_out, b_out, &matrix, row_len, p_buf,
+                        cg_buf, &a_in_perm, &b_in_perm, a_out_perms,
+                        b_out_perms, block_i_start, bin_i_size, local_unique);
+
+      local_unique = result.unique;
+      size_t push_count = result.count;
+      size_t ends_len = stbds_arrlenu(local_ends);
+      if (ends_len == 0) {
+        stbds_arrput(local_ends, push_count);
+      } else {
+        size_t last_end = stbds_arrlast(local_ends);
+        stbds_arrput(local_ends, last_end + push_count);
+      }
+
+      fprintf(stderr, ".");
+      fflush(stderr);
+
+    }  // End of omp for
+       // Implicit barrier
+
+    // Merge locals
+    pthread_rwlock_wrlock(&vector_lock);
+    {
+      for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
+        stbds_arrpush(unique, local_unique[i]);
+      }
+      size_t ends_base_value = 0;
+      if (stbds_arrlen(ends) > 0) {
+        ends_base_value = stbds_arrlast(ends);
+      }
+      for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
+        stbds_arrpush(ends, local_ends[i] + ends_base_value);
+      }
+    }
+    pthread_rwlock_unlock(&vector_lock);
+
+#pragma omp barrier
+
+    stbds_arrfree(local_unique);
+    stbds_arrfree(local_ends);
+    local_unique = NULL;
+    local_ends = NULL;
+
+#pragma omp single
+    {
+      fprintf(stderr, "Base round complete\n");
+      fflush(stderr);
+    }
+
+    // Reductions round
+
+    size_t round_bin_count;
+    pthread_rwlock_rdlock(&vector_lock);
+    { round_bin_count = stbds_arrlenu(ends); }
+    pthread_rwlock_unlock(&vector_lock);
+
+    while (round_bin_count > 1) {
+      const size_t pairs = round_bin_count / 2;
+
+#pragma omp for schedule(dynamic)
+      for (size_t pair_i = 0; pair_i < pairs; pair_i++) {
+        size_t *first_block_idxs, *second_block_idxs;
+        size_t first_block_len, second_block_len;
+
+        if (pair_i == 0) {
+          first_block_idxs = unique;
+          first_block_len = ends[0];
+        } else {
+          first_block_idxs = unique + ends[2 * pair_i - 1];
+          first_block_len = ends[2 * pair_i] - ends[2 * pair_i - 1];
+        }
+        second_block_idxs = unique + ends[2 * pair_i];
+        second_block_len = ends[2 * pair_i + 1] - ends[2 * pair_i];
+
+        reduction_result_t result = UniqueInSubsetPair(
+            a_in, b_in, a_out, b_out, &matrix, row_len, p_buf, cg_buf,
+            &a_in_perm, &b_in_perm, a_out_perms, b_out_perms, first_block_idxs,
+            first_block_len, second_block_idxs, second_block_len, local_unique);
+
+        local_unique = result.unique;
+        size_t reduction_size = result.count;
+        if (stbds_arrlenu(local_ends) == 0) {
+          stbds_arrput(local_ends, reduction_size);
+        } else {
+          size_t last_end = stbds_arrlast(local_ends);
+          stbds_arrput(local_ends, last_end + reduction_size);
+        }
+
+      }  // End of loop over pairs
+         // Implicit barrier. Important because of the following #single
+
+#pragma omp single
      {
-        clock_t now = clock();
-        uintmax_t tested_now = tested;
-        ReportProgress(tested_now, total, now);
-      }
+        if (IsOdd(round_bin_count)) {
+          // Push the remaining bin into the swap variables
+          size_t *odd_block_idxs = unique + ends[stbds_arrlenu(ends) - 2];
+          size_t odd_block_len =
+              stbds_arrlast(ends) - ends[stbds_arrlenu(ends) - 2];

-      if (seen[lhs_i]) {
-        const uintmax_t remaining = (uintmax_t)row_count * perm_iterations;
-#pragma omp atomic update
-        tested += remaining;
-        continue;
-      }
-
-      data_t *lhs = matrix.head + lhs_i * matrix.row_len;
-
-      for (size_t rhs_i = lhs_i + 1; rhs_i < row_count; rhs_i++) {
-        if (seen[lhs_i]) {
-          const uintmax_t remaining =
-              (uintmax_t)(row_count - lhs_i) * perm_iterations;
-#pragma omp atomic update
-          tested += remaining;
-          break;
-        }
-
-        if (seen[rhs_i]) {
-#pragma omp atomic update
-          tested += perm_iterations;
-          continue;
-        }
-
-        data_t *rhs = matrix.head + rhs_i * matrix.row_len;
-
-        PermutationReset(&a_in_perm);
-        while (!a_in_perm.exhausted) {
-          PermutationReset(&b_in_perm);
-          while (!b_in_perm.exhausted) {
-            ResetConditionalPermutations(a_out_perms, a_out);
-            while (!a_out_perms[a_out - 1].exhausted) {
-              ResetConditionalPermutations(b_out_perms, b_out);
-              while (!b_out_perms[b_out - 1].exhausted) {
-                if (seen[lhs_i]) {
-                  goto skip_permutations;
-                }
-
-                // Compare the two rows
-                FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
-                          a_in_perm.permutation, b_in_perm.permutation,
-                          a_out_perms, b_out_perms);
-
-                {
-                  FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
-                  _Bool equivalent = 1;
-                  for (size_t i = row_len; i > 0; i--) {
-                    if (lhs[i - 1] != cg_buf[i - 1]) {
-                      equivalent = 0;
-                      break;
-                    }
-                  }
-
-                  if (equivalent) {
-                    seen[rhs_i] = 1;
-                    goto skip_permutations;
-                  }
-                }
-
-                // If the number of output labels are the same, and the number
-                // of input labels is also the same, we can also check for
-                // equality under party swapping. I don't expect this
-                // conditional to be very penalizing because it's very
-                // predictable.
-                if (a_in == b_in && a_out == b_out) {
-                  // Use the results in p_buf
-                  PSwapParties(a_out, a_in, p_buf);
-                  FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
-                  _Bool equivalent = 1;
-                  for (size_t i = row_len; i > 0; i--) {
-                    if (lhs[i - 1] != cg_buf[i - 1]) {
-                      equivalent = 0;
-                      break;
-                    }
-                  }
-
-                  if (equivalent) {
-                    seen[rhs_i] = 1;
-                    goto skip_permutations;
-                  }
-                }
-
-                AdvanceConditionalPermutations(b_out_perms, b_out);
-              }
-
-              AdvanceConditionalPermutations(a_out_perms, a_out);
-            }
-
-            PermutationNext(&b_in_perm);
+          for (size_t i = 0; i < odd_block_len; i++) {
+            stbds_arrput(local_unique, odd_block_idxs[i]);
+          }
+          if (stbds_arrlenu(local_ends) > 0) {
+            size_t last_end = stbds_arrlast(local_ends);
+            stbds_arrput(local_ends, last_end + odd_block_len);
+          } else {
+            stbds_arrput(local_ends, odd_block_len);
          }
-
-          PermutationNext(&a_in_perm);
        }

-      skip_permutations:;
-#pragma omp atomic update
-        tested += perm_iterations;
-      }  // For loop over rhs_i
-    }    // For loop over lhs_i
+        stbds_arrfree(unique);
+        stbds_arrfree(ends);
+        unique = NULL;
+        ends = NULL;
+      }  // End of single section
+         // Impilcit barrier. Important due to the clearing of unique, local.
+
+      // Merge locals into global
+      pthread_rwlock_wrlock(&vector_lock);
+      {
+        for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
+          stbds_arrpush(unique, local_unique[i]);
+        }
+        size_t ends_base_value = 0;
+        if (stbds_arrlen(ends) > 0) {
+          ends_base_value = stbds_arrlast(ends);
+        }
+        for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
+          stbds_arrpush(ends, local_ends[i] + ends_base_value);
+        }
+      }
+      pthread_rwlock_unlock(&vector_lock);
+
+      stbds_arrfree(local_unique);
+      stbds_arrfree(local_ends);
+      local_unique = NULL;
+      local_ends = NULL;
+
+#pragma omp barrier
+
+      pthread_rwlock_rdlock(&vector_lock);
+      { round_bin_count = stbds_arrlenu(ends); }
+      pthread_rwlock_unlock(&vector_lock);
+    }  // End of pairs reduction loop
+
+    // Program is done!
+    // `unique` should hold the indices of the unique rows.

 #pragma omp single
    {
@ -382,34 +679,34 @@ int main(int argc, char *argv[]) {
      }

      // Print every unique row
-      for (size_t i = 0; i < row_count; i++) {
-        if (!seen[i]) {
-          switch (args.formatting) {
-            default:
-            case kRowFmt: {
-              PrintMatrixRow(&matrix, i);
-            } break;
-            case kCgFmt: {
-              printf("%zu: ", i);
-              PrintCg(a_out, b_out, a_in, b_in,
-                      matrix.head + i * matrix.row_len);
-            } break;
-            case kPFmt: {
-              printf("%zu: ", i);
-              FromCgToP(a_out, b_out, a_in, b_in,
-                        matrix.head + i * matrix.row_len, p_buf,
-                        a_in_perm.permutation, b_in_perm.permutation,
-                        a_out_perms, b_out_perms);
-              PrintP(a_out, b_out, a_in, b_in, p_buf);
-            } break;
-            case kPRawFmt: {
-              FromCgToP(a_out, b_out, a_in, b_in,
-                        matrix.head + i * matrix.row_len, p_buf,
-                        a_in_perm.permutation, b_in_perm.permutation,
-                        a_out_perms, b_out_perms);
-              PrintPRaw(a_out, b_out, a_in, b_in, p_buf);
-            } break;
-          }
+      size_t unique_count = stbds_arrlenu(unique);
+      for (size_t i = 0; i < unique_count; i++) {
+        size_t row_idx = unique[i];
+        switch (args.formatting) {
+          default:
+          case kRowFmt: {
+            PrintMatrixRow(&matrix, row_idx);
+          } break;
+          case kCgFmt: {
+            printf("%zu: ", i);
+            PrintCg(a_out, b_out, a_in, b_in,
+                    matrix.head + row_idx * matrix.row_len);
+          } break;
+          case kPFmt: {
+            printf("%zu: ", i);
+            FromCgToP(a_out, b_out, a_in, b_in,
+                      matrix.head + row_idx * matrix.row_len, p_buf,
+                      a_in_perm.permutation, b_in_perm.permutation, a_out_perms,
+                      b_out_perms);
+            PrintP(a_out, b_out, a_in, b_in, p_buf);
+          } break;
+          case kPRawFmt: {
+            FromCgToP(a_out, b_out, a_in, b_in,
+                      matrix.head + row_idx * matrix.row_len, p_buf,
+                      a_in_perm.permutation, b_in_perm.permutation, a_out_perms,
+                      b_out_perms);
+            PrintPRaw(a_out, b_out, a_in, b_in, p_buf);
+          } break;
        }
      }
    }  // End of omp single region
@ -429,7 +726,10 @@ int main(int argc, char *argv[]) {
  }  // End of omp parallel region

  // Free all the memory so the sanitizer is happy.
-  free(seen);
+  pthread_rwlock_destroy(&vector_lock);
+  pthread_rwlock_destroy(&vector_swap_lock);
+  stbds_arrfree(unique);
+  stbds_arrfree(ends);
  MatrixFree(&matrix);
  return EXIT_SUCCESS;
 }
--- a/src/matrix.c
+++ b/src/matrix.c
@ -21,7 +21,7 @@ void DebugPrintMatrix(matrix_t *matrix) {
 }

 // Prints a row of a `matrix_t` to standard output.
-void PrintMatrixRow(matrix_t *matrix, size_t row) {
+void PrintMatrixRow(const matrix_t *matrix, const size_t row) {
  for (size_t entry = 0; entry < matrix->row_len; entry++) {
    printf("%" PRImDATA, *(matrix->head + row * (matrix->row_len) + entry));
    if (entry != matrix->row_len - 1) {
Author	SHA1	Message	Date
Miguel M	077c5ef61d	Aux. script to test for compute-bound	2023-06-16 15:52:37 +01:00
Miguel M	2f2c65d360	Updated README after changes on this branch	2023-06-06 18:36:19 +01:00
Miguel M	b60644eecc	No LFS! It breaks :)	2023-06-06 16:06:33 +01:00
Miguel M	9671205c15	Aux script to split input data files into chunks	2023-06-06 16:04:30 +01:00
Miguel M	0ea106abcf	(Hopeful) improvements by using transitivity of equivalence	2023-06-06 16:03:09 +01:00
Miguel M	3a79b7072e	Less blocking	2023-06-05 23:51:56 +01:00
Miguel M	1209fd27ec	Fairly sure it's working now	2023-06-05 21:08:32 +01:00
Miguel M	ef6f80aaaf	WIP. pathological case is known	2023-06-05 20:57:26 +01:00