test/Examples.tests.ts

import { Assembler, compileVM, VM } from '../src/index';
import Trace from '../src/Trace';
import * as regexParser from './util/regexParser';

describe('whynot.js examples', () => {
	// whynot.js was designed to answer the question of *why* a given input does not match a given
	// grammar. It can sometimes even tell you how to extend the input so that it will match. To
	// illustrate this, consider a simple subset of regular expressions.
	describe('regular expressions', () => {
		// We have generated a very simple parser using prsc for the subset of regular expressions
		// consisting of character matches (a-z, lower case), sequences, choices ("|") and grouping
		// using parentheses ("(" and ")"). This will create a simple AST. These functions traverse
		// the AST recursively and generate a whynot program using the provided assembler.
		function compile(
			assembler: Assembler<string, string>,
			ast: regexParser.RegEx,
			recordMissing: boolean
		) {
			// Appends the instructions for `ast` to `assembler.program`. When `recordMissing` is
			// set, every expected character is recorded and may also be skipped (at the cost of
			// a `bad` instruction), so the records along a trace spell out a completed string.
			// Repetition nodes are rejected with an Error('Not implemented').

			function compileTest(test: regexParser.Test) {
				// A test represents an expected character, e.g., /a/
				const expected = test.value;
				// Both sides are strings, so compare strictly and avoid any coercion
				const matchesExpected = (input: string): boolean => input === expected;

				if (!recordMissing) {
					// Normally, it is simply represented by a test instruction which fails if
					// the input character is not the expected character.
					assembler.test(matchesExpected);
					return;
				}

				// To record missing characters, we add a branch for each allowing the VM to
				// skip the character. In both cases, we use a record instruction to
				// remember the character when it is processed.
				assembler.record(expected);
				const skipTest = assembler.jump([]);
				// Branch for existing character
				skipTest.data.push(assembler.program.length);
				assembler.test(matchesExpected);
				const skipBad = assembler.jump([]);
				// Branch for missing character
				skipTest.data.push(assembler.program.length);
				// Prefer the branch where the character exists
				assembler.bad();
				// Join both branches to continue execution
				skipBad.data.push(assembler.program.length);
			}

			function compileAtom(atom: regexParser.Atom) {
				// An atom is either a parenthesised group (a nested choice) or a single character
				switch (atom.type) {
					case 'choice':
						return compileRegex(atom);

					case 'test':
						return compileTest(atom);
				}
			}

			function compileQuantified(quantified: regexParser.Quantified) {
				if (quantified.type !== 'repetition') {
					return compileAtom(quantified);
				}

				// Not implemented for this example
				throw new Error('Not implemented');
			}

			function compileSeq(seq: regexParser.Seq) {
				// A sequence of characters and/or groups, e.g., /abc/
				// This is represented in the program by simply executing its parts in the
				// specified order.
				seq.forEach(part => {
					compileQuantified(part);
				});
			}

			function compileRegex(regex: regexParser.RegEx) {
				// Alternatives, e.g., /a|b|c/
				// These are represented in the VM by forking execution to all options in
				// parallel and merging the surviving threads afterwards.
				const fork = assembler.jump([]);
				type Instruction = typeof fork;
				const joins: Instruction[] = [];
				regex.value.forEach(seq => {
					// Each alternative starts at the next free program index
					fork.data.push(assembler.program.length);
					compileSeq(seq);
					// Its target is only known once every alternative has been emitted
					joins.push(assembler.jump([]));
				});
				// All alternatives continue at the first instruction after the choice
				joins.forEach(join => {
					join.data.push(assembler.program.length);
				});
			}

			compileRegex(ast);
		}

		// We can now define a simple helper to glue everything together
		function compileRegexVM(regex: string, recordMissing: boolean): VM<string, string> {
			// Builds a VM for `regex`; `recordMissing` is forwarded to `compile` and selects
			// between plain matching and the variant that records (and may skip) characters.
			const parsedRegex = regexParser.parse(regex);

			return compileVM(program => {
				// Emit the instructions for the regex itself...
				compile(program, parsedRegex, recordMissing);
				// ...and accept every thread that survives until the end of the program, as it
				// has matched the complete input.
				program.accept();
			});
		}

		// One more quick helper to pull full strings out of the trace trees generated by the VM
		// when it is recording its progression.
		function flattenRecordStrings(
			traces: Trace<string>[],
			head: string[] = [],
			flatRecords: string[] = []
		) {
			// Walks the trace tree through its prefixes, prepending each non-null record to
			// `head`. Every trace without prefixes contributes one joined string to
			// `flatRecords`, which is also the return value.
			expect(traces).toBeInstanceOf(Array);

			traces.forEach(trace => {
				// Traces without a record leave the collected records untouched
				const records = trace.record === null ? head : [trace.record, ...head];

				if (trace.prefixes.length === 0) {
					// Nothing precedes this trace: the collected records form a full string
					flatRecords.push(records.join(''));
					return;
				}

				// Otherwise keep collecting along every prefix
				flattenRecordStrings(trace.prefixes, records, flatRecords);
			});

			return flatRecords;
		}

		it('can perform simple matching', () => {
			// Repairing an input only makes sense once we can tell whether it is broken at all.
			// Without the recording instructions the VM acts as a plain matcher: each accepting
			// trace describes one way the input matched, and no traces means no match.
			const matcher = compileRegexVM('abc(d|e)f', false);
			const run = (input: string) => matcher.execute(Array.from(input));

			// 'abcdf' is accepted, in exactly one way
			const accepted = run('abcdf');
			expect(accepted.success).toBe(true);
			expect(accepted.acceptingTraces.length).toBe(1);

			// 'abcf' is rejected and therefore yields no traces
			const rejected = run('abcf');
			expect(rejected.success).toBe(false);
			expect(rejected.acceptingTraces.length).toBe(0);
		});

		it('can complete a string based on a regex', () => {
			// With the additional instructions that allow and record missing characters, the
			// traces returned by the VM describe how to fix the input, provided it can be fixed
			// by adding more characters.
			const vm = compileRegexVM('(a|(bc))d(e|f)', true);
			const completionsOf = (input: string): string[] =>
				flattenRecordStrings(vm.execute(Array.from(input)).acceptingTraces);

			// The regex contains two choices; every choice the input settles removes branches
			// from the result. 'ad' settles the first choice but not the second, so both
			// endings remain:
			expect(completionsOf('ad')).toEqual(['ade', 'adf']);

			// 'bf' settles both choices, leaving a single completion:
			expect(completionsOf('bf')).toEqual(['bcdf']);

			// 'd' settles neither, which produces every string accepted by the regex:
			expect(completionsOf('d')).toEqual(['ade', 'bcde', 'adf', 'bcdf']);

			// An input which can not be made to match by adding characters yields nothing:
			expect(completionsOf('abc')).toEqual([]);
		});
	});

	// As well as telling you why a string does not match a certain language, whynot.js can, to an
	// extent, predict extensions to the input string also matching the language
	describe('regular expression exploration', () => {
		// The regex parser used here covers the subset of regular expressions consisting of
		// character matches (a-z, lower case), sequences, choices ("|"), Kleene star ("*") and
		// grouping using parentheses ("(" and ")"). It produces a simple AST whose nodes are
		// told apart by their `type` and hold their content in `value`. The compile function
		// traverses the AST recursively and generates a whynot program using the provided
		// assembler.
		// Record produced for a regex character: `input` is that character and `isExploration`
		// is true when it was recorded without being tested against the input (a possible
		// extension), false when it was recorded right after a test instruction.
		type ExplorationRecord = { isExploration: boolean; input: string };
		function compile(
			assembler: Assembler<string, ExplorationRecord>,
			ast: regexParser.RegEx,
			recordingMode: boolean
		) {
			// Appends the instructions for `ast` to `assembler.program`. Outside recording mode
			// characters are tested against the input and recorded as matched; in recording
			// mode they are only recorded, flagged as explorations, without consuming input.

			function compileTest(test: regexParser.Test, recordingMode: boolean) {
				// A test represents an expected character, e.g., /a/
				const recordAs = (isExploration: boolean) => {
					// The record itself is generated by the callback, no static data is attached
					assembler.record(null, () => ({ isExploration, input: test.value }));
				};

				if (recordingMode) {
					// Only note that this character could occur here
					recordAs(true);
					return;
				}

				// Normally, it is represented by a test instruction which fails if the input
				// character is not the expected character, followed by a record of the match.
				// Both sides are strings, so compare strictly and avoid any coercion.
				assembler.test(input => input === test.value);
				recordAs(false);
			}

			function compileAtom(atom: regexParser.Atom, recordingMode: boolean) {
				// An atom is either a parenthesised group (a nested choice) or a single character
				switch (atom.type) {
					case 'choice':
						return compileRegex(atom, recordingMode);

					case 'test':
						return compileTest(atom, recordingMode);
				}
			}

			function compileQuantified(quantified: regexParser.Quantified, recordingMode: boolean) {
				if (quantified.type !== 'repetition') {
					return compileAtom(quantified, recordingMode);
				}

				// Kleene star: Unbounded Repetition, e.g., /a*/

				if (recordingMode) {
					// In recording mode, the recording of a* is the same as the recording of a
					// single a. This optimizes the program length of a language with star
					// height > 1 significantly.
					compileAtom(quantified.value, true);
					return;
				}

				// Otherwise the star is represented in the VM by a loop consisting of:
				//  - a recording part
				//  - a testing part, providing a return
				//  - another recording part
				//  - a jump back to the start of the loop

				// Record the possible insertion of this character at 0
				compileAtom(quantified.value, true);

				// The loop head either enters the body or leaves the loop
				const start = assembler.program.length;
				const join = assembler.jump([]);
				join.data.push(assembler.program.length);

				// Test for the existing character at n
				compileAtom(quantified.value, false);

				// Record the possible insertion at n + 1
				compileAtom(quantified.value, true);

				assembler.jump([start]);

				// Exit target: the first instruction after the loop
				join.data.push(assembler.program.length);
			}

			function compileSeq(seq: regexParser.Seq, recordingMode: boolean) {
				// A sequence of characters and/or groups, e.g., /abc/
				// This is represented in the program by simply executing its parts in the
				// specified order.
				seq.forEach(part => {
					compileQuantified(part, recordingMode);
				});
			}

			function compileRegex(regex: regexParser.RegEx, recordingMode: boolean) {
				// Alternatives, e.g., /a|b|c/
				// These are represented in the VM by forking execution to all options in
				// parallel and merging the surviving threads afterwards.
				const fork = assembler.jump([]);
				type Instruction = typeof fork;
				const joins: Instruction[] = [];
				regex.value.forEach(seq => {
					// Each alternative starts at the next free program index
					fork.data.push(assembler.program.length);
					compileSeq(seq, recordingMode);
					// Its target is only known once every alternative has been emitted
					joins.push(assembler.jump([]));
				});
				// All alternatives continue at the first instruction after the choice
				joins.forEach(join => {
					join.data.push(assembler.program.length);
				});
			}

			compileRegex(ast, recordingMode);
		}

		// We can now define a simple helper to glue everything together
		function compileRegexVM(
			regex: string,
			recordMissing: boolean
		): VM<string, ExplorationRecord> {
			// Builds a VM for `regex`; `recordMissing` is handed to `compile` as its recording
			// mode flag.
			const parsedRegex = regexParser.parse(regex);

			return compileVM(program => {
				// Emit the instructions for the regex itself...
				compile(program, parsedRegex, recordMissing);
				// ...and accept every thread that survives until the end of the program, as it
				// has matched the complete input.
				program.accept();
			});
		}

		// One more quick helper to pull full strings out of the trace trees generated by the VM
		// when it is recording its progression.
		function flattenRecordStrings(
			traces: Trace<ExplorationRecord>[],
			head: string[] = [],
			flatRecords: string[] = []
		) {
			// Walks the trace tree through its prefixes, prepending the text of each non-null
			// record to `head`. Exploration records are wrapped in square brackets, other
			// records appear as their plain input. Every trace without prefixes contributes one
			// joined string to `flatRecords`, which is also the return value.
			expect(traces).toBeInstanceOf(Array);

			traces.forEach(trace => {
				const record = trace.record;
				let pieces = head;
				if (record !== null) {
					const text = record.isExploration ? `[${record.input}]` : record.input;
					pieces = [text, ...head];
				}

				if (trace.prefixes.length === 0) {
					// Nothing precedes this trace: the collected pieces form a full string
					flatRecords.push(pieces.join(''));
					return;
				}

				// Otherwise keep collecting along every prefix
				flattenRecordStrings(trace.prefixes, pieces, flatRecords);
			});

			return flatRecords;
		}

		it('can specify possible extensions to the inputted string of length 1', () => {
			// Outside recording mode the program still matches the input, but every star also
			// records which characters could have been inserted around each matched character.
			// Those exploration records show up in brackets in the flattened strings.
			const vm = compileRegexVM('(a|b)*', false);

			// This regex should match the string 'a', and generate extensions '[a]a[a]', '[b]a[a]',
			// '[a]a[b]', '[b]a[b]'
			const matchingResult = vm.execute(Array.from('a'));
			expect(matchingResult.success).toBe(true);
			expect(matchingResult.acceptingTraces.length).toBe(1);

			// Sort the results of the traces since the order should be undefined
			expect(flattenRecordStrings(matchingResult.acceptingTraces).sort()).toEqual([
				'[a]a[a]',
				'[a]a[b]',
				'[b]a[a]',
				'[b]a[b]'
			]);
		});

		it('can specify possible extensions to the inputted string of length 2', () => {
			// Same language as the length 1 case, now with two matched characters and hence
			// three positions at which an exploration record is produced.
			const explorer = compileRegexVM('(a|b)*', false);

			// 'aa' matches, and every slot of [a|b]a[a|b]a[a|b] can independently be a or b
			const outcome = explorer.execute(Array.from('aa'));
			expect(outcome.success).toBe(true);
			expect(outcome.acceptingTraces.length).toBe(1);

			// Sort the results of the traces since the order should be undefined
			const explorations = flattenRecordStrings(outcome.acceptingTraces).sort();
			expect(explorations).toEqual([
				'[a]a[a]a[a]',
				'[a]a[a]a[b]',
				'[a]a[b]a[a]',
				'[a]a[b]a[b]',
				'[b]a[a]a[a]',
				'[b]a[a]a[b]',
				'[b]a[b]a[a]',
				'[b]a[b]a[b]'
			]);
		});

		it('can specify possible extensions to the inputted string of length 3, in a language with star-height 2', () => {
			// Test case: the input runs through the outer star exactly once
			const explorer = compileRegexVM('(a*b*c)*', false);

			// 'abc' matches; the matched characters appear unbracketed between the bracketed
			// exploration records
			const outcome = explorer.execute(Array.from('abc'));
			expect(outcome.success).toBe(true);
			expect(outcome.acceptingTraces.length).toBe(1);

			// Sort the results of the traces since the order should be undefined
			const explorations = flattenRecordStrings(outcome.acceptingTraces).sort();
			expect(explorations).toEqual(['[a][b][c][a]a[a][b]b[b]c[a][b][c]']);

			// Note that the individual explorations are not schema-valid, though the string may be
			// completed using the previous example.
		});

		it('can specify possible extensions to the inputted string in a language with star-height 2, providing input that matches the outer star twice', () => {
			// Test case: the input runs through the outer star twice
			const explorer = compileRegexVM('(a*b*c)*', false);

			// 'aabbcaabbc' matches; the matched characters appear unbracketed between the
			// bracketed exploration records
			const outcome = explorer.execute(Array.from('aabbcaabbc'));
			expect(outcome.success).toBe(true);
			expect(outcome.acceptingTraces.length).toBe(1);

			// Sort the results of the traces since the order should be undefined
			const explorations = flattenRecordStrings(outcome.acceptingTraces).sort();
			expect(explorations).toEqual([
				'[a][b][c][a]a[a]a[a][b]b[b]b[b]c[a][b][c][a]a[a]a[a][b]b[b]b[b]c[a][b][c]'
			]);

			// Note that the individual explorations may not all be schema-valid, though the string
			// may be completed using the previous example.
		});

		it('can specify possible extensions to the inputted string in a language with star-height 2, providing input matching the outer star thrice', () => {
			// Test case: the input runs through the outer star three times
			const explorer = compileRegexVM('(a*b*c)*', false);

			// 'aabbcaabbcaabbc' matches; the matched characters appear unbracketed between the
			// bracketed exploration records
			const outcome = explorer.execute(Array.from('aabbcaabbcaabbc'));
			expect(outcome.success).toBe(true);
			expect(outcome.acceptingTraces.length).toBe(1);

			// Sort the results of the traces as the order should be undefined
			const explorations = flattenRecordStrings(outcome.acceptingTraces).sort();
			expect(explorations).toEqual([
				'[a][b][c][a]a[a]a[a][b]b[b]b[b]c[a][b][c][a]a[a]a[a][b]b[b]b[b]c[a][b][c][a]a[a]a[a][b]b[b]b[b]c[a][b][c]'
			]);

			// Note that the individual explorations may not all be schema-valid, though the string
			// may be completed using the previous example.
		});
	});

	describe('greediness using badness', () => {
		it('provides ordering on badness over joined threads: greedy to start', () => {
			const vm = compileVM<string, number>(assembler => {
				// As a regex: roughly A*(.*), with the trailing .* in non-greedy mode: every
				// character it consumes passes a `bad` instruction. The record between the two
				// loops marks where the capturing group (CG) starts.
				// Aims to match AAABBB as AAA(BBB) as opposed to (AAABBB), A(AABBB) or AA(ABBB)

				// A*
				const startIndex = assembler.program.length;
				const start = assembler.jump([]);
				start.data.push(assembler.program.length);
				assembler.test(input => input === 'A');
				// Loop back to the fork to try for another A
				assembler.jump([startIndex]);
				start.data.push(assembler.program.length);

				// Record position, to make a start of the CG
				assembler.record({}, (_, index) => index);

				// .*, non-greedy
				const start2Index = assembler.program.length;
				const start2 = assembler.jump([]);
				start2.data.push(assembler.program.length);
				assembler.bad();
				assembler.test(_input => true);
				assembler.jump([start2Index]);
				start2.data.push(assembler.program.length);

				// Done
				assembler.accept();
			});

			const result = vm.execute(['A', 'A', 'A', 'B', 'B', 'B']);
			//                          0    1    2    3    4    5    6
			//                                         '--- Expect CG to start here
			expect(result.success).toBe(true);

			// Follow the first prefix at every join until a record is found; the test expects
			// this to be the path on which A* consumed all three A's.
			const firstRecord = (function findFirstRecord(trace: Trace<number>): number {
				if (trace.record !== null) {
					return trace.record;
				}

				return findFirstRecord(trace.prefixes[0]);
			})(result.acceptingTraces[0]);
			expect(firstRecord).toBe(3);
		});

		it('provides ordering on badness over joined threads, greedy to end', () => {
			const vm = compileVM<string, number>(assembler => {
				// As a regex: roughly .*(A*), with the leading .* in non-greedy mode: every
				// character it consumes passes a `bad` instruction. The record between the two
				// loops marks where the capturing group (CG) starts.
				// Aims to match BBBAAA as BBB(AAA) as opposed to BBBA(AA), BBBAA(A) or BBBAAA()

				// .*, non-greedy
				const start2Index = assembler.program.length;
				const start2 = assembler.jump([]);
				start2.data.push(assembler.program.length);
				assembler.bad();
				assembler.test(_input => true);
				assembler.jump([start2Index]);
				start2.data.push(assembler.program.length);

				// Record position, to make a start of the CG
				assembler.record({}, (_, index) => index);

				// A*
				const startIndex = assembler.program.length;
				const start = assembler.jump([]);
				start.data.push(assembler.program.length);
				assembler.test(input => input === 'A');
				// Loop back to the fork to try for another A
				assembler.jump([startIndex]);
				start.data.push(assembler.program.length);

				// Done
				assembler.accept();
			});

			const result = vm.execute(['B', 'B', 'B', 'A', 'A', 'A']);
			//                          0    1    2    3    4    5    6
			//                                         '--- Expect CG to start here
			// Fail with a clear message instead of a TypeError below if nothing was accepted
			expect(result.success).toBe(true);

			// Follow the first prefix at every join until a record is found; the test expects
			// this to be the path on which .* consumed only the three B's.
			const firstRecord = (function findFirstRecord(trace: Trace<number>): number {
				if (trace.record !== null) {
					return trace.record;
				}

				return findFirstRecord(trace.prefixes[0]);
			})(result.acceptingTraces[0]);
			expect(firstRecord).toBe(3);
		});
	});
});