Compare commits

...

8 Commits
master ... dac

Author SHA1 Message Date
Miguel M 077c5ef61d Aux. script to test for compute-bound 2023-06-16 15:52:37 +01:00
Miguel M 2f2c65d360 Updated README after changes on this branch 2023-06-06 18:36:19 +01:00
Miguel M b60644eecc No LFS! It breaks :) 2023-06-06 16:06:33 +01:00
Miguel M 9671205c15 Aux script to split input data files into chunks 2023-06-06 16:04:30 +01:00
Miguel M 0ea106abcf (Hopeful) improvements by using transitivity of equivalence 2023-06-06 16:03:09 +01:00
Miguel M 3a79b7072e Less blocking 2023-06-05 23:51:56 +01:00
Miguel M 1209fd27ec Fairly sure it's working now 2023-06-05 21:08:32 +01:00
Miguel M ef6f80aaaf WIP. pathological case is known 2023-06-05 20:57:26 +01:00
9 changed files with 685 additions and 230 deletions

2
.gitattributes vendored
View File

@ -1,2 +0,0 @@
data/3333_inq.txt filter=lfs diff=lfs merge=lfs -text
splits/*.txt filter=lfs diff=lfs merge=lfs -text

69
README
View File

@ -22,7 +22,12 @@ scripts. The main executable can be compiled by running the corresponding
python3 make.py link
This will produce a main.exe executable in the root directory.
This will produce a DEBUG [main.debug.exe] executable in the root directory.
If a RELEASE environment variable is set, a RELEASE [main.release.exe]
executable is produced instead, i.e., to produce a release executable, run:
RELEASE=1 python3 make.py link
See all available recipes by running
python3 make.py --list
@ -34,6 +39,64 @@ and read more about [sane]: at https://github.com/mikeevmm/sane.
To use, pipe the input (structured according to the source specification) into
the executable, while specifying the number of outputs and inputs, in that
order. E.g.,
order, followed by a bin size (see the following section for details on the
meaning of the bin size). E.g.,
./main.exe 2 2 2 2 < data/2222_inq.txt
./main.release.exe 2 2 2 2 48 < data/2222_inq.txt
[[Notes on the Approach Taken]]
Originally, pairs were straightforwardly compared. This was too slow to tackle
even, for example [data/3332_inq.txt]. (Cf. [data/README].) So, instead, I'm now
trying a divide-and-conquer approach, now explained.
Consider the input to be N lines. From the ground truths, we know that the
number of non-equivalent rows, which we will denote k, is such that k << N.
So consider a subset of L lines (k << L < N). Without loss of generality, take
N/L to be an integer. For each subset L, at most k lines are non-equivalent.
We consider a "reduction" within each block, and then comparisons across
blocks: if there are P permutations to be considered (see [doc/proposal.pdf]
for details) this means, worst-case scenario, less than
(N/L)L²P + (N/2L)k²P + (N/4L)k²P + ... + (N/L)/2^log(N/L) k²P
= (N/L)L²P + (N/L)k²P(1 - L/N)
= NP[L + k²/L (1 - L/N)] (1)
operations. Compare this with the "straightforward" mode of operation, where
N²P (2)
operations are required. Clearly, the divide-and-conquer approach has an
advantage as long as the term in square brackets in (1) is significantly smaller
than N.
In pseudo-code:
block_size is provided
unique := []
sizes := []
base round:
# distribute loop
for block of rows of size block_size:
local_unique := unique rows in local block
push local_unique to unique
push len(local_unique) to sizes
reductions:
while len(sizes) > 1:
unique' := []
sizes' := []
# distribute loop
for pair of blocks in (unique, sizes): # Odd ones out are ignored
local := concatenation of pair of blocks
local_unique = unique rows in local
push local_unique to unique'
push len(local_unique) to sizes'
unique := unique'
sizes := sizes'
done

20
aux/computebound.sh Normal file
View File

@ -0,0 +1,20 @@
HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
OUTDIR=$(realpath "$HERE/../data/computebound/")
EXE=$(realpath "$HERE/../main.release.exe")
ARGS=(3 3 3 2 64)
CORES=(1 2 3 4 5 6 7 8 9 10 11 12)
CHUNKS=(0 1 2 3 4)
for core in ${CORES[@]}
do
#for chunk in ${CHUNKS[@]}
#do
#INFILE=$(realpath "$HERE/../data/splits/${chunk}_3233_inq.txt")
#OUTFILE="$OUTDIR/core_${core}_"$(basename "$INFILE.txt")
#echo "$EXE ${ARGS[@]} < \"$INFILE\"" | tee --append "$OUTFILE"
#OMP_THREAD_NUM=$core time --portability --append --output="$OUTFILE" "$EXE" "${ARGS[@]}" < "$INFILE" >> "$OUTFILE"
#done
INFILE=$(realpath "$HERE/../data/3233_inq.txt")
OUTFILE="$OUTDIR/core_${core}_$(basename "$INFILE.txt")"
echo "$EXE ${ARGS[@]} < \"$INFILE\"" | tee --append "$OUTFILE"
OMP_THREAD_NUM=$core time --portability --append --output="$OUTFILE" "$EXE" "${ARGS[@]}" < "$INFILE" >> "$OUTFILE"
done

71
aux/split_data.py Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Splits an input file into random chunks.
Usage:
split_data <input file> <chunks>
See [root]/data/splits/README for a rationale.
"""
import os
import argparse
CHUNK_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/splits"))
def get_args():
doclines = __doc__.splitlines()
description = doclines[0]
epilog = "\n".join(doclines[1:])
parser = argparse.ArgumentParser(description=description, epilog=epilog)
parser.add_argument("<input file>", type=str, help="Input file to split.")
parser.add_argument("<chunks>", type=int, help="Number of chunks to produce.")
args = vars(parser.parse_args())
return args
def chunk_fpath(source_path, chunk_i, chunk_total):
in_basename = os.path.basename(source_path)
out_basename = f"{chunk_total}_{chunk_i}_{in_basename}"
return os.path.join(CHUNK_DIR, out_basename)
def main():
args = get_args()
source_path = args["<input file>"]
chunk_count = args["<chunks>"]
with open(source_path, "r") as source_file:
line_count = 0
for _line in source_file:
line_count += 1
chunk_len = line_count // chunk_count
source_file.seek(0)
for chunk_i in range(chunk_count - 1):
outfile_name = chunk_fpath(source_path, chunk_i, chunk_count)
print(f'Writing {outfile_name}...')
with open(outfile_name, "w") as out_file:
chunk_i_len = 0
for line in source_file:
out_file.write(line)
chunk_i_len += 1
if chunk_i_len >= chunk_len:
break
if line_count % chunk_len != 0:
outfile_name = chunk_fpath(source_path, chunk_count - 1, chunk_count)
print(f'Writing {outfile_name}...')
with open(outfile_name, "w") as out_file:
for line in source_file:
out_file.write(line)
if __name__ == "__main__":
main()

2
data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
splits/
!splits/.gitinclude

View File

@ -19,7 +19,7 @@ typedef struct {
void DebugPrintMatrix(matrix_t *matrix);
// Prints a row of a `matrix_t` to standard output.
void PrintMatrixRow(matrix_t *matrix, size_t row);
void PrintMatrixRow(const matrix_t *matrix, const size_t row);
// Parse a matrix from the standard input.
//

View File

@ -7,7 +7,7 @@ from sane import *
COMPILER = "gcc"
COMPILE_FLAGS = [
"-std=c11",
"-std=gnu11",
"-g3",
"-O0",
"-Wall",
@ -16,9 +16,10 @@ COMPILE_FLAGS = [
"-Wpedantic",
"-pedantic-errors",
"-fopenmp",
"-pthread",
]
LINK_FLAGS = ["-fopenmp"]
LIBRARIES = ["-lm"]
LINK_FLAGS = ["-fopenmp", "-pthread"]
LIBRARIES = ["-lm", "-lpthread"]
OBJ_DIR = os.path.join("obj", "release" if ("RELEASE" in os.environ) else "debug")
SRC_DIR = "src"
EXE_NAME = "main." + ("release" if ("RELEASE" in os.environ) else "debug") + ".exe"
@ -26,7 +27,7 @@ EXE_NAME = "main." + ("release" if ("RELEASE" in os.environ) else "debug") + ".e
ROOT = os.path.dirname(os.path.realpath(__file__))
if "RELEASE" in os.environ:
COMPILE_FLAGS[COMPILE_FLAGS.index("-O0")] = "-O2"
COMPILE_FLAGS[COMPILE_FLAGS.index("-O0")] = "-O3"
COMPILE_FLAGS[COMPILE_FLAGS.index("-g3")] = "-g"
COMPILE_FLAGS.append("-DRELEASE")
LINK_FLAGS.append("-DRELEASE")

View File

@ -1,5 +1,7 @@
#include <alloca.h>
#include <math.h>
#include <omp.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@ -7,71 +9,15 @@
#include <time.h>
#include <unistd.h>
// #include "../contrib/orlp/ipow.h"
#define STB_DS_IMPLEMENTATION
#define STBDS_NO_SHORT_NAMES
#include "../contrib/orlp/ipow.h"
#include "../contrib/stb/stb_ds.h"
#include "../includes/cg.h"
#include "../includes/matrix.h"
#include "../includes/permutation.h"
static uintmax_t _timing_last_count = 0;
static clock_t _timing_last_time = 0.;
static float _timing_rate = 0.;
static float _timing_exp_backoff = 1.;
static void ReportProgress(uintmax_t current, uintmax_t total, clock_t now) {
_Bool updated = 0;
#pragma omp critical
{
if (current >= _timing_last_count) {
float inferred_rate = (float)(current - _timing_last_count) /
((float)(now - _timing_last_time) / CLOCKS_PER_SEC);
if (inferred_rate < _timing_rate) {
_timing_rate = inferred_rate;
_timing_exp_backoff = 0.3;
} else {
_timing_rate = _timing_rate * (1. - _timing_exp_backoff) +
inferred_rate * _timing_exp_backoff;
_timing_exp_backoff = fmax(0.99 * _timing_exp_backoff, 0.001);
}
_timing_last_count = current;
_timing_last_time = now;
updated = 1;
}
}
if (updated) {
uintmax_t time_left = (total - current) / _timing_rate;
uintmax_t days = time_left / 60 / 60 / 24;
if (days > 0) {
fprintf(stderr,
"\33[2K\r%zu/%zu (%" PRIuMAX " iter/sec | About %" PRIuMAX
" days left.)",
current, total, (uintmax_t)_timing_rate, days);
} else {
uint8_t hours = (time_left % (60 * 60 * 24)) / 60;
uint8_t minutes = (time_left % (60 * 60)) / 60;
uint8_t seconds = time_left % 60;
fprintf(stderr,
"\33[2K\r%zu/%zu (%" PRIuMAX " iter/sec | About %" PRIu8
"h%" PRIu8 "m%" PRIu8 "s left.)",
current, total, (uintmax_t)_timing_rate, hours, minutes, seconds);
}
fflush(stderr);
}
}
static uintmax_t Factorial(uint8_t value) {
uintmax_t acc = 1;
for (uint8_t i = 2; i <= value; i++) {
acc *= (uintmax_t)value;
}
return acc;
}
static inline _Bool IsOdd(size_t value) { return (_Bool)(value & 1); }
static _Bool ReadArguments(char *from, size_t *into, const char *varname) {
if (!sscanf(from, "%zu", into)) {
@ -82,19 +28,21 @@ static _Bool ReadArguments(char *from, size_t *into, const char *varname) {
}
static inline void AdvanceConditionalPermutations(
permutation_generator_t *generators, size_t count) {
PermutationNext(generators);
permutation_generator_t *generators, const size_t count) {
PermutationNext(generators + 0);
for (size_t i = 0; i < count - 1; i++) {
if (generators[i].exhausted) {
PermutationReset(generators + i);
PermutationNext(generators + i + 1);
} else {
break;
}
}
// If generators[-1] is exhausted, the conditional permutations are exhausted.
}
static inline void ResetConditionalPermutations(
permutation_generator_t *generators, size_t count) {
permutation_generator_t *generators, const size_t count) {
for (size_t i = 0; i < count; i++) {
PermutationReset(generators + i);
}
@ -112,6 +60,7 @@ typedef struct {
size_t b_out;
size_t a_in;
size_t b_in;
size_t bin_size;
arg_format_t formatting;
} args_t;
@ -129,17 +78,18 @@ static args_t ParseArgs(size_t argc, char *argv[]) {
}
}
size_t a_out, b_out, a_in, b_in;
size_t a_out, b_out, a_in, b_in, bin_size;
arg_format_t formatting;
if (stbds_arrlen(positionals) != 4) {
if (stbds_arrlen(positionals) != 5) {
goto arg_error;
}
if (!ReadArguments(positionals[0], &a_out, "A outputs") ||
!ReadArguments(positionals[1], &b_out, "B outputs") ||
!ReadArguments(positionals[2], &a_in, "A inputs") ||
!ReadArguments(positionals[3], &b_in, "B inputs")) {
!ReadArguments(positionals[3], &b_in, "B inputs") ||
!ReadArguments(positionals[4], &bin_size, "bin size")) {
goto arg_error;
}
@ -160,6 +110,7 @@ static args_t ParseArgs(size_t argc, char *argv[]) {
.a_out = a_out,
.b_in = b_in,
.b_out = b_out,
.bin_size = bin_size,
.formatting = formatting,
};
@ -176,20 +127,306 @@ arg_error:
if (argc > 0) {
fprintf(stderr, "%s", argv[0]);
} else {
fprintf(stderr, "./exe");
fprintf(stderr, "./main.exe");
}
fprintf(stderr,
" [--cg|--p|--p-raw] <A outputs> <B outputs> <A inputs> <B inputs>");
" [--cg|--p|--p-raw] <A outputs> <B outputs> <A inputs> <B inputs> "
"<bin size>");
exit(EXIT_FAILURE);
}
static _Bool RowsEquivalent(
const size_t a_in, const size_t b_in, const size_t a_out,
const size_t b_out, size_t row_len, data_t *p_buf, data_t *cg_buf,
permutation_generator_t *a_in_perm, permutation_generator_t *b_in_perm,
permutation_generator_t *a_out_perms, permutation_generator_t *b_out_perms,
const data_t *restrict lhs, const data_t *restrict rhs) {
PermutationReset(a_in_perm);
PermutationReset(b_in_perm);
for (size_t i = 0; i < a_in; i++) {
PermutationReset(a_out_perms + i);
}
for (size_t i = 0; i < b_in; i++) {
PermutationReset(b_out_perms + i);
}
while (!a_in_perm->exhausted) {
PermutationReset(b_in_perm);
while (!b_in_perm->exhausted) {
ResetConditionalPermutations(a_out_perms, a_in);
while (!a_out_perms[a_in - 1].exhausted) {
ResetConditionalPermutations(b_out_perms, b_in);
while (!b_out_perms[b_in - 1].exhausted) {
// Compare the two rows
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
a_in_perm->permutation, b_in_perm->permutation, a_out_perms,
b_out_perms);
{
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
_Bool equivalent = 1;
for (size_t i = row_len; i > 0; i--) {
if (lhs[i - 1] != cg_buf[i - 1]) {
equivalent = 0;
break;
}
}
if (equivalent) {
return 1;
}
}
// If the number of output labels are the same, and the number
// of input labels is also the same, we can also check for
// equality under party swapping. I don't expect this
// conditional to be very penalizing because it's very
// predictable.
if (a_in == b_in && a_out == b_out) {
// Use the results in p_buf
PSwapParties(a_out, a_in, p_buf);
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
_Bool equivalent = 1;
for (size_t i = row_len; i > 0; i--) {
if (lhs[i - 1] != cg_buf[i - 1]) {
equivalent = 0;
break;
}
}
if (equivalent) {
return 1;
}
}
AdvanceConditionalPermutations(b_out_perms, b_in);
}
AdvanceConditionalPermutations(a_out_perms, a_in);
}
PermutationNext(b_in_perm);
}
PermutationNext(a_in_perm);
}
return 0;
}
typedef struct {
// The updated pointer to the vector with the indices of the unique rows.
size_t *unique;
// The number of elements pushed to the vector.
size_t count;
} reduction_result_t;
// Calculates inequivalent rows out of a given block of the rows. A block is a
// contiguous subset of rows, with indices
// `block_start..(block_start+row_count)`.
//
// Arguments:
// a_in Number of inputs for A
// b_in Number of inputs for B
// a_out Number of outputs for A
// a_out Number of outputs for B
// matrix The matrix whose rows are to be examined
// row_len Length of a matrix row
// p_buf data_t buffer capable of holding a P representation row
// cg_buf data_t buffer capable of holding a CG representation row
// a_in_perm permutation_generator_t for permutations of A's input
// b_in_perm permutation_generator_t for permutations of B's input
// a_out_perms permutation_generator_t* array for permutations of A's output
// b_out_perms permutation_generator_t* array for permutations of B's output
// block_start Where to start counting the indices from.
// row_count Length of block
// unique size_t stbds vector pointer
//
// The vector `unique` is only pushed to; it's the user's responsibility to
// allocate or free it as needed.
//
// `row_count` MUST be greater than 0, otherwise behaviour is undefined.
//
// `p_buf`, and `cg_buf` WILL be clobbered. They don't need to be initialized,
// only allocated. It's the caller's responsibility to free these buffers if
// applicable.
//
// The permutation generators are expected to be correctly initialized; this
// function only resets the permutation generators, it does NOT allocate them
// or free them.
static reduction_result_t UniqueInBlock(
const size_t a_in, const size_t b_in, const size_t a_out,
const size_t b_out, const matrix_t *matrix, size_t row_len, data_t *p_buf,
data_t *cg_buf, permutation_generator_t *a_in_perm,
permutation_generator_t *b_in_perm, permutation_generator_t *a_out_perms,
permutation_generator_t *b_out_perms, const size_t block_start,
const size_t row_count, size_t *unique) {
size_t *remaining = NULL;
size_t *remaining_swap = NULL;
size_t push_count = 0;
// First round: perform the first split of equivalence classes.
{
stbds_arrput(unique, block_start);
push_count++;
data_t *root = matrix->head + block_start * matrix->row_len;
for (size_t rhs_i = 1; rhs_i < row_count; rhs_i++) {
data_t *rhs = matrix->head + (block_start + rhs_i) * matrix->row_len;
_Bool equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len,
p_buf, cg_buf, a_in_perm, b_in_perm,
a_out_perms, b_out_perms, root, rhs);
if (!equivalent) {
stbds_arrput(remaining, rhs_i);
}
}
}
// Remaining rounds: perform reduction
{
while (stbds_arrlenu(remaining) > 0) {
size_t lhs_i = stbds_arrpop(remaining);
stbds_arrput(unique, block_start + lhs_i);
push_count++;
data_t *lhs = matrix->head + (block_start + lhs_i) * matrix->row_len;
for (size_t i = 0; i < stbds_arrlenu(remaining); i++) {
size_t rhs_i = remaining[i];
data_t *rhs = matrix->head + (block_start + rhs_i) * matrix->row_len;
_Bool equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len,
p_buf, cg_buf, a_in_perm, b_in_perm,
a_out_perms, b_out_perms, lhs, rhs);
if (!equivalent) {
stbds_arrput(remaining_swap, rhs_i);
}
}
stbds_arrfree(remaining);
remaining = remaining_swap;
remaining_swap = NULL;
}
}
stbds_arrfree(remaining);
stbds_arrfree(remaining_swap);
reduction_result_t result = {
.unique = unique,
.count = push_count,
};
return result;
}
// Calculates inequivalent rows out of a given subset of the rows. The subset
// is identified by a pair of arrays of indices.
//
// Arguments:
// a_in Number of inputs for A
// b_in Number of inputs for B
// a_out Number of outputs for A
// a_out Number of outputs for B
// matrix The matrix whose rows are to be examined
// row_len Length of a matrix row
// p_buf data_t buffer capable of holding a P representation row
// cg_buf data_t buffer capable of holding a CG representation row
// a_in_perm permutation_generator_t for permutations of A's input
// b_in_perm permutation_generator_t for permutations of B's input
// a_out_perms permutation_generator_t* array for permutations of A's
// output
// b_out_perms permutation_generator_t* array for permutations of B's
// output
// first_row_idxs First array of indices of rows to consider
// first_row_count Length of first_row_idxs[]
// second_row_idxs Second array of indices of rows to consider
// second_row_count Length of second_row_idxs[]
// unique size_t stbds vector pointer
//
// The vector `unique` is only pushed to; it's the user's responsibility to
// allocate or free it as needed.
//
// `first_row_count` and `second_row_count` MUST be greater than 0, otherwise
// behaviour is undefined.
//
// `seen`, `p_buf`, and `cg_buf` WILL be clobbered. They don't need to be
// initialized, only allocated. It's the caller's responsibility to free these
// buffers if applicable.
//
// The permutation generators are expected to be correctly initialized; this
// function only resets the permutation generators, it does NOT allocate them
// or free them.
static reduction_result_t UniqueInSubsetPair(
const size_t a_in, const size_t b_in, const size_t a_out,
const size_t b_out, const matrix_t *matrix, const size_t row_len,
data_t *p_buf, data_t *cg_buf, permutation_generator_t *a_in_perm,
permutation_generator_t *b_in_perm, permutation_generator_t *a_out_perms,
permutation_generator_t *b_out_perms, const size_t *first_row_idxs,
const size_t first_row_count, const size_t *second_row_idxs,
const size_t second_row_count, size_t *unique) {
// This one is a little more complicated:
// Every two elements within `first_row_idxs` are inequivalent, by design, and
// likewise for `second_row_idxs`. This means that if a row A in `first_row`
// is equivalent to a row A' in `second_row`, it cannot be equivalent to any
// other row in `second_row` by transitivity of the equivalence.
size_t push_count = 0;
size_t *classes = NULL;
for (size_t i = 0; i < first_row_count; i++) {
size_t lhs_i = first_row_idxs[i];
stbds_arrput(unique, lhs_i);
push_count++;
stbds_arrput(classes, lhs_i);
}
for (size_t i = 0; i < second_row_count; i++) {
if (stbds_arrlenu(classes) == 0) {
break;
}
size_t rhs_i = second_row_idxs[i];
data_t *rhs = matrix->head + rhs_i * matrix->row_len;
_Bool equivalent;
for (size_t i = 0; i < stbds_arrlenu(classes); i++) {
size_t lhs_i = classes[i];
data_t *lhs = matrix->head + lhs_i * matrix->row_len;
equivalent = RowsEquivalent(a_in, b_in, a_out, b_out, row_len, p_buf,
cg_buf, a_in_perm, b_in_perm, a_out_perms,
b_out_perms, lhs, rhs);
if (equivalent) {
stbds_arrdel(classes, i);
break;
}
}
if (!equivalent) {
stbds_arrput(unique, rhs_i);
push_count++;
}
}
stbds_arrfree(classes);
reduction_result_t result = {
.unique = unique,
.count = push_count,
};
return result;
}
int main(int argc, char *argv[]) {
args_t args = ParseArgs(argc, argv);
size_t a_out, a_in, b_out, b_in;
a_out = args.a_out;
b_out = args.b_out;
a_in = args.a_in;
b_in = args.b_in;
const args_t args = ParseArgs(argc, argv);
const size_t a_out = args.a_out;
const size_t b_out = args.b_out;
const size_t a_in = args.a_in;
const size_t b_in = args.b_in;
const size_t bin_size = args.bin_size;
if (a_out < 2 || b_out < 2 || a_in < 2 || b_in < 2) {
fprintf(stderr,
@ -198,28 +435,36 @@ int main(int argc, char *argv[]) {
exit(EXIT_FAILURE);
}
size_t row_len = (a_out - 1) * (b_out - 1) * a_in * b_in +
(a_out - 1) * a_in + (b_out - 1) * b_in + 1;
const size_t row_len = (a_out - 1) * (b_out - 1) * a_in * b_in +
(a_out - 1) * a_in + (b_out - 1) * b_in + 1;
matrix_t matrix = ParseMatrix(row_len);
size_t row_count = matrix.len / matrix.row_len;
const size_t row_count = matrix.len / matrix.row_len;
_Bool *seen = calloc(row_count, sizeof(_Bool));
if (seen == NULL) {
fprintf(stderr, "Failed to allocate equivalence flag string. Aborting.");
if (bin_size > row_count) {
fprintf(stderr,
"Bin size cannot be bigger than the number of rows in the input.");
exit(EXIT_FAILURE);
}
// Start testing for equivalences.
const uintmax_t perm_iterations =
Factorial(a_in) * Factorial(b_in) *
(uintmax_t)upow((uint64_t)(Factorial(a_in)), (uint8_t)a_out) *
(uintmax_t)upow((uint64_t)(Factorial(b_in)), (uint8_t)b_out);
const uintmax_t total =
(uintmax_t)row_count * (uintmax_t)row_count * perm_iterations;
uintmax_t tested = 0;
// BEGINNING OF COMPARISON ALGORITHM
const size_t bin_count =
((row_count % bin_size == 0) ? row_count / bin_size
: row_count / bin_size + 1);
size_t *unique = NULL;
size_t *ends =
NULL; // Bin i contains elements with indices ends[i-1]..ends[i]
pthread_rwlockattr_t attr;
pthread_rwlockattr_init(&attr);
pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
pthread_rwlock_t vector_lock, vector_swap_lock;
pthread_rwlock_init(&vector_lock, &attr);
#pragma omp parallel default(none) \
shared(stderr, args, a_out, b_out, a_in, b_in, row_len, matrix, row_count, \
seen, perm_iterations, total, tested)
bin_size, bin_count, unique, ends, vector_lock)
{
data_t *p_buf = malloc(a_out * b_out * a_in * b_in * sizeof(data_t));
data_t *cg_buf = malloc((((a_out - 1) * (b_out - 1) * a_in * b_in +
@ -253,120 +498,172 @@ int main(int argc, char *argv[]) {
for (size_t i = 0; i < a_in; i++) {
a_out_perms[i] = PermutationNewGenerator(a_out);
PermutationReset(a_out_perms + i);
}
for (size_t i = 0; i < b_in; i++) {
b_out_perms[i] = PermutationNewGenerator(b_out);
PermutationReset(b_out_perms + i);
}
// Base round
size_t *local_unique = NULL;
size_t *local_ends = NULL;
#pragma omp for schedule(dynamic)
for (size_t lhs_i = 0; lhs_i < row_count - 1; lhs_i++) {
for (size_t bin_i = 0; bin_i < bin_count; bin_i++) {
const size_t block_i_start = bin_i * bin_size;
size_t bin_i_size = row_count - block_i_start;
if (bin_i_size > bin_size) {
bin_i_size = bin_size;
}
reduction_result_t result =
UniqueInBlock(a_in, b_in, a_out, b_out, &matrix, row_len, p_buf,
cg_buf, &a_in_perm, &b_in_perm, a_out_perms,
b_out_perms, block_i_start, bin_i_size, local_unique);
local_unique = result.unique;
size_t push_count = result.count;
size_t ends_len = stbds_arrlenu(local_ends);
if (ends_len == 0) {
stbds_arrput(local_ends, push_count);
} else {
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + push_count);
}
fprintf(stderr, ".");
fflush(stderr);
} // End of omp for
// Implicit barrier
// Merge locals
pthread_rwlock_wrlock(&vector_lock);
{
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
stbds_arrpush(unique, local_unique[i]);
}
size_t ends_base_value = 0;
if (stbds_arrlen(ends) > 0) {
ends_base_value = stbds_arrlast(ends);
}
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
stbds_arrpush(ends, local_ends[i] + ends_base_value);
}
}
pthread_rwlock_unlock(&vector_lock);
#pragma omp barrier
stbds_arrfree(local_unique);
stbds_arrfree(local_ends);
local_unique = NULL;
local_ends = NULL;
#pragma omp single
{
fprintf(stderr, "Base round complete\n");
fflush(stderr);
}
// Reductions round
size_t round_bin_count;
pthread_rwlock_rdlock(&vector_lock);
{ round_bin_count = stbds_arrlenu(ends); }
pthread_rwlock_unlock(&vector_lock);
while (round_bin_count > 1) {
const size_t pairs = round_bin_count / 2;
#pragma omp for schedule(dynamic)
for (size_t pair_i = 0; pair_i < pairs; pair_i++) {
size_t *first_block_idxs, *second_block_idxs;
size_t first_block_len, second_block_len;
if (pair_i == 0) {
first_block_idxs = unique;
first_block_len = ends[0];
} else {
first_block_idxs = unique + ends[2 * pair_i - 1];
first_block_len = ends[2 * pair_i] - ends[2 * pair_i - 1];
}
second_block_idxs = unique + ends[2 * pair_i];
second_block_len = ends[2 * pair_i + 1] - ends[2 * pair_i];
reduction_result_t result = UniqueInSubsetPair(
a_in, b_in, a_out, b_out, &matrix, row_len, p_buf, cg_buf,
&a_in_perm, &b_in_perm, a_out_perms, b_out_perms, first_block_idxs,
first_block_len, second_block_idxs, second_block_len, local_unique);
local_unique = result.unique;
size_t reduction_size = result.count;
if (stbds_arrlenu(local_ends) == 0) {
stbds_arrput(local_ends, reduction_size);
} else {
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + reduction_size);
}
} // End of loop over pairs
// Implicit barrier. Important because of the following #single
#pragma omp single
{
clock_t now = clock();
uintmax_t tested_now = tested;
ReportProgress(tested_now, total, now);
}
if (IsOdd(round_bin_count)) {
// Push the remaining bin into the swap variables
size_t *odd_block_idxs = unique + ends[stbds_arrlenu(ends) - 2];
size_t odd_block_len =
stbds_arrlast(ends) - ends[stbds_arrlenu(ends) - 2];
if (seen[lhs_i]) {
const uintmax_t remaining = (uintmax_t)row_count * perm_iterations;
#pragma omp atomic update
tested += remaining;
continue;
}
data_t *lhs = matrix.head + lhs_i * matrix.row_len;
for (size_t rhs_i = lhs_i + 1; rhs_i < row_count; rhs_i++) {
if (seen[lhs_i]) {
const uintmax_t remaining =
(uintmax_t)(row_count - lhs_i) * perm_iterations;
#pragma omp atomic update
tested += remaining;
break;
}
if (seen[rhs_i]) {
#pragma omp atomic update
tested += perm_iterations;
continue;
}
data_t *rhs = matrix.head + rhs_i * matrix.row_len;
PermutationReset(&a_in_perm);
while (!a_in_perm.exhausted) {
PermutationReset(&b_in_perm);
while (!b_in_perm.exhausted) {
ResetConditionalPermutations(a_out_perms, a_out);
while (!a_out_perms[a_out - 1].exhausted) {
ResetConditionalPermutations(b_out_perms, b_out);
while (!b_out_perms[b_out - 1].exhausted) {
if (seen[lhs_i]) {
goto skip_permutations;
}
// Compare the two rows
FromCgToP(a_out, b_out, a_in, b_in, rhs, p_buf,
a_in_perm.permutation, b_in_perm.permutation,
a_out_perms, b_out_perms);
{
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
_Bool equivalent = 1;
for (size_t i = row_len; i > 0; i--) {
if (lhs[i - 1] != cg_buf[i - 1]) {
equivalent = 0;
break;
}
}
if (equivalent) {
seen[rhs_i] = 1;
goto skip_permutations;
}
}
// If the number of output labels are the same, and the number
// of input labels is also the same, we can also check for
// equality under party swapping. I don't expect this
// conditional to be very penalizing because it's very
// predictable.
if (a_in == b_in && a_out == b_out) {
// Use the results in p_buf
PSwapParties(a_out, a_in, p_buf);
FromPToCg(a_out, b_out, a_in, b_in, cg_buf, p_buf);
_Bool equivalent = 1;
for (size_t i = row_len; i > 0; i--) {
if (lhs[i - 1] != cg_buf[i - 1]) {
equivalent = 0;
break;
}
}
if (equivalent) {
seen[rhs_i] = 1;
goto skip_permutations;
}
}
AdvanceConditionalPermutations(b_out_perms, b_out);
}
AdvanceConditionalPermutations(a_out_perms, a_out);
}
PermutationNext(&b_in_perm);
for (size_t i = 0; i < odd_block_len; i++) {
stbds_arrput(local_unique, odd_block_idxs[i]);
}
if (stbds_arrlenu(local_ends) > 0) {
size_t last_end = stbds_arrlast(local_ends);
stbds_arrput(local_ends, last_end + odd_block_len);
} else {
stbds_arrput(local_ends, odd_block_len);
}
PermutationNext(&a_in_perm);
}
skip_permutations:;
#pragma omp atomic update
tested += perm_iterations;
} // For loop over rhs_i
} // For loop over lhs_i
stbds_arrfree(unique);
stbds_arrfree(ends);
unique = NULL;
ends = NULL;
} // End of single section
// Impilcit barrier. Important due to the clearing of unique, local.
// Merge locals into global
pthread_rwlock_wrlock(&vector_lock);
{
for (size_t i = 0; i < stbds_arrlenu(local_unique); i++) {
stbds_arrpush(unique, local_unique[i]);
}
size_t ends_base_value = 0;
if (stbds_arrlen(ends) > 0) {
ends_base_value = stbds_arrlast(ends);
}
for (size_t i = 0; i < stbds_arrlenu(local_ends); i++) {
stbds_arrpush(ends, local_ends[i] + ends_base_value);
}
}
pthread_rwlock_unlock(&vector_lock);
stbds_arrfree(local_unique);
stbds_arrfree(local_ends);
local_unique = NULL;
local_ends = NULL;
#pragma omp barrier
pthread_rwlock_rdlock(&vector_lock);
{ round_bin_count = stbds_arrlenu(ends); }
pthread_rwlock_unlock(&vector_lock);
} // End of pairs reduction loop
// Program is done!
// `unique` should hold the indices of the unique rows.
#pragma omp single
{
@ -382,34 +679,34 @@ int main(int argc, char *argv[]) {
}
// Print every unique row
for (size_t i = 0; i < row_count; i++) {
if (!seen[i]) {
switch (args.formatting) {
default:
case kRowFmt: {
PrintMatrixRow(&matrix, i);
} break;
case kCgFmt: {
printf("%zu: ", i);
PrintCg(a_out, b_out, a_in, b_in,
matrix.head + i * matrix.row_len);
} break;
case kPFmt: {
printf("%zu: ", i);
FromCgToP(a_out, b_out, a_in, b_in,
matrix.head + i * matrix.row_len, p_buf,
a_in_perm.permutation, b_in_perm.permutation,
a_out_perms, b_out_perms);
PrintP(a_out, b_out, a_in, b_in, p_buf);
} break;
case kPRawFmt: {
FromCgToP(a_out, b_out, a_in, b_in,
matrix.head + i * matrix.row_len, p_buf,
a_in_perm.permutation, b_in_perm.permutation,
a_out_perms, b_out_perms);
PrintPRaw(a_out, b_out, a_in, b_in, p_buf);
} break;
}
size_t unique_count = stbds_arrlenu(unique);
for (size_t i = 0; i < unique_count; i++) {
size_t row_idx = unique[i];
switch (args.formatting) {
default:
case kRowFmt: {
PrintMatrixRow(&matrix, row_idx);
} break;
case kCgFmt: {
printf("%zu: ", i);
PrintCg(a_out, b_out, a_in, b_in,
matrix.head + row_idx * matrix.row_len);
} break;
case kPFmt: {
printf("%zu: ", i);
FromCgToP(a_out, b_out, a_in, b_in,
matrix.head + row_idx * matrix.row_len, p_buf,
a_in_perm.permutation, b_in_perm.permutation, a_out_perms,
b_out_perms);
PrintP(a_out, b_out, a_in, b_in, p_buf);
} break;
case kPRawFmt: {
FromCgToP(a_out, b_out, a_in, b_in,
matrix.head + row_idx * matrix.row_len, p_buf,
a_in_perm.permutation, b_in_perm.permutation, a_out_perms,
b_out_perms);
PrintPRaw(a_out, b_out, a_in, b_in, p_buf);
} break;
}
}
} // End of omp single region
@ -429,7 +726,10 @@ int main(int argc, char *argv[]) {
} // End of omp parallel region
// Free all the memory so the sanitizer is happy.
free(seen);
pthread_rwlock_destroy(&vector_lock);
pthread_rwlock_destroy(&vector_swap_lock);
stbds_arrfree(unique);
stbds_arrfree(ends);
MatrixFree(&matrix);
return EXIT_SUCCESS;
}

View File

@ -21,7 +21,7 @@ void DebugPrintMatrix(matrix_t *matrix) {
}
// Prints a row of a `matrix_t` to standard output.
void PrintMatrixRow(matrix_t *matrix, size_t row) {
void PrintMatrixRow(const matrix_t *matrix, const size_t row) {
for (size_t entry = 0; entry < matrix->row_len; entry++) {
printf("%" PRImDATA, *(matrix->head + row * (matrix->row_len) + entry));
if (entry != matrix->row_len - 1) {