/**
* @mainpage
* @anchor mainpage
* @brief
* @details
* @copyright Russell John Childs, PhD, 2016
* @author Russell John Childs, PhD
* @date 2016-05-07
*
* This file contains classes: ExtendedRegExp, Parser
*
* Problem statement:
* You are given a dictionary (dictionary.txt), containing a list of words, one
* per line. Imagine you have seven tiles. Each tile is either blank or contains
* a single lowercase letter (a-z).
*
* Please list all the words from the dictionary that can be produced by using
* some or all of the seven tiles, in any order. A blank tile is a wildcard,
* and can be used in place of any letter.
*
* Try to use a minimal amount of memory.
*
* 1. Find all of the words that can be formed if you don't have to deal with
* blank tiles. (You may skip this step and go straight to step 2).
*
* 2. Find all of the words that can be formed, including those where blank
* tiles are used as wildcards.
*
* 3. Would you do things differently if you had to process several hundred
* tile sets with the same dictionary?
*
* Expectations:
*
* a) Please write down the reasoning or the explanation behind your solution in
* plain English or pseudo-code.
*
* b) Please provide the source code of your implementation. Only 1 and 2 need
* source code.
*
* c) Please include instructions on how to compile and run your code.
*
* d) Bonus points for source code in C/C++/C#.
*
*
* Solution: Use a bucket sort array that counts each tile, with the number of
* blank tiles stored in the last entry,
* e.g. "bbbaa" followed by four blanks -> bsa[0]=2, bsa[1]=3, bsa[size-1]=4;
* Iterate through chars in string to be matched:
*
* (1) If bsa[chr] > 0, decrement bsa[chr]
*
* (2) Otherwise (no tile for chr is left), decrement bsa[size-1]
*
* (3) If bsa[size-1] < 0, there is no match.
*
* Specifications:
*
* (1) C shall denote a range of contiguous ASCII characters
* and shall default to ['a','z'].
* [This is the specification relating to "tiles"]
*
* (2) W shall denote a "wildcard" character, W.
* [This is the specification relating to "blank tiles"]
*
* (3) The user shall use the single-space ' ' for W.
* [This is the specification relating to "blank tiles"]
* (4) The system shall maintain an internal value for
* W of char(127) so that it is greater than the non-wildcard characters.
*
* (5) R shall denote the regular expression [CW]{n}, where n shall be
* specified by the user and default to 7
* [This is the specification relating to "7 tiles"]
*
* (6) S shall denote the set of all permutations of R,
* i.e. the set of all regular expressions that may be formed by
* permuting the characters in R.
*
* (7) D shall be a set of strings delimited by the newline character.
* [This is the specification relating to "dictionary"]
*
* (8) No string in D shall contain the wildcard character ' ',
* i.e. no string shall contain a single space.
*
* (9) The system shall list all strings from D for which a match against
* any element in the set S, or a substr of the element,
* exists, in the order in which they appear in D.
* [This is the specification relating to requirements (1) and (2). (1) & (2)
* may be reduced to 1 requirement by deleting " " in regex,
* eg: "abc" <--> "a b c"]
*
* (10) The list specified in (9) shall be returned one string at a time and
* shall not be stored as an internal list of matching strings.
* [This is the specification relating to "minimal memory"]
*
* (11) Matching on a string in D shall be O(n) in complexity.
* [This is the specification relating to requirement (3)]
*
*
* (12) [Next release]. The system shall be multithreaded and shall divide
* dictionary.tmp into thread data and shall implement a multithreaded
* bucket-sort [This is the specification relating to requirement (3)].
*
* Compiled and tested under Linux Mint, using g++ 4.8.
*
* g++ options: -O0 -g3 -Wall -O0 -fopenmp -mavx -m64 -g -Wall -c
* -fmessage-length=0 -fno-omit-frame-pointer --fast-math
* -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include/
*
* Linker options: -Wl,--no-as-needed -fopenmp
* -L/opt/intel/vtune_amplifier_xe_2013/lib64/
*
* Cmd line options: -lpthread -latomic -littnotify -ldl
*
* Documentation: Doxygen comments for interfaces, normal for impl.
*
* Usage: Place this cpp and unit_test.hpp in the same directory and compile.
* After compiling, run and specify choices at the prompts:
*
* "Please specify the pattern to be matched"
*
* "Please specify the fully-qualified dictionary filename"
*
* "Please specify the fully-qualified filename for the results"
*
* The binary will send:
*
* (1) test results to stdout
*
* (2) results to results file
*
* (3) Temporary dictionary file to "./tmp_dictionary.txt"
*
* Inputs and outputs:
*
* Input: dictionary file
*
* Input/output: tmp_dictionary file
*
* Output: results file
*
* The file unit_test.hpp is required for the tests and must be requested from
* author.
*
* @file dimensional_mechanics.cpp
* @see
* @ref mainpage
*/
#include <algorithm>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "unit_test.hpp"
//Unit test framework is written for Linux. This #define ports it to
//Visual Studio 2013
//#define __PRETTY_FUNCTION__ __FUNCSIG__
/**
* @addtogroup RegexpProblem
* @{
*/
namespace RegexpProblem
{
/**
* This class implements an O(n) algorithm that determines whether a word can
* be matched by any permutation of the characters in a pattern (including
* wildcard characters), or by a prefix of such a permutation.
*
* The algorithm uses a bucket sort array that counts the number of occurrences
* of each char in the pattern, with the number of wildcards recorded in the
* last entry of the array.
*
* Example: ".a.bb." - bsa[0]=1, bsa[1]=2, bsa[size-1]=3
*
* Having built the array, the string to be matched is examined.
* Each char found in the string is used to decrement the corresponding
* entry in the array. Example "b" -> bsa[1]=2 ---> bsa[1]=1
*
* When a particular entry is 0 (or the char has no entry at all), the entry
* for the wildcard is decremented instead to signify it is being used in place
* of the character.
*
* Finally, if the entry for the wildcard goes negative, it must mean there is
* no match.
*
* Internally every wildcard is stored as char(127), so a pattern that
* contains a literal char(127) treats it as a wildcard as well.
*
* This class is intended to be a replaceable helper class that supplements
* the std::regex lib.
*
*/
class ExtendedRegExp
{
public:
    /**
    * Builds the bucket sort array for the pattern. Any std::exception thrown
    * while doing so is reported on std::cout and then propagates to the
    * caller (a constructor function-try-block always rethrows).
    *
    * @param regexp {const std::string&} - A regex, e.g. ".a.b.c"
    * @param wildcard {char} - The char used as a wildcard, e.g. ' ', '.', '?'
    * @param bucket_sort_size {unsigned} - The minimum size for the bucket sort
    * array. The array is enlarged automatically when the span of
    * non-wildcard chars in the pattern does not fit.
    */
    ExtendedRegExp(const std::string& regexp, char wildcard=' ',
        unsigned bucket_sort_size=28) try :
        m_regexp(regexp),
        m_buckets(std::max(bucket_sort_size, 1u), 0), //always room for wildcards
        m_wildcard(127),
        m_begin(127)
    {
        //Map the caller's wildcard onto the internal wildcard value
        std::replace(m_regexp.begin(), m_regexp.end(), wildcard, m_wildcard);
        //Find the span of the non-wildcard chars, compared as unsigned values
        //so that it agrees with the arithmetic performed by index()
        unsigned lowest = 255;
        unsigned highest = 0;
        bool has_letters = false;
        for (const char chr : m_regexp)
        {
            if (chr != m_wildcard)
            {
                const unsigned code = static_cast<unsigned char>(chr);
                lowest = std::min(lowest, code);
                highest = std::max(highest, code);
                has_letters = true;
            }
        }
        if (has_letters)
        {
            m_begin = static_cast<char>(lowest);
            //One bucket per char in [lowest, highest] plus the wildcard bucket
            const std::size_t needed = (highest - lowest) + 2u;
            if (m_buckets.size() < needed)
            {
                m_buckets.resize(needed, 0);
            }
        }
        //Initialise bucket sort
        const std::size_t wildcard_slot = m_buckets.size() - 1;
        for (const char chr : m_regexp)
        {
            ++m_buckets[chr != m_wildcard ? index(chr) : wildcard_slot];
        }
    }
    catch (const std::exception& except)
    {
        //Print any exception thrown
        std::cout << "Exception initialising bucket sort in constructor: "
            << except.what()
            << std::endl;
    }
    /**
    * This returns the index of a char relative to the smallest non-wildcard
    * char in the regex. Chars smaller than that char wrap around to a very
    * large unsigned value, which callers treat as "no bucket".
    *
    * Example: If the smallest char in the regex is 'a' then the index
    * of 'c' will be 2
    *
    * @param chr {char} - the char whose relative "ascii" index is to be found
    *
    * @return {unsigned} - The relative index
    */
    unsigned index(char chr) const
    {
        return unsigned(static_cast<unsigned char>(chr)) -
            unsigned(static_cast<unsigned char>(m_begin));
    }
    /**
    * This matches a string against all permutations of the regex used to
    * initialise the class object. It uses the bucket sort array described
    * in the documentation for this class. Blank strings are a match and
    * strings matched against a substr of regex are also a match.
    *
    * @param word {const std::string&} - the string to be matched
    *
    * @return {bool} - true if the word is empty or can be spelt from the
    * chars and wildcards of the regex, false otherwise
    */
    bool operator==(const std::string& word) const
    {
        if (word.empty())
        {
            return true;
        }
        //Only consider words short enough to be spanned by regexp
        if (word.length() > m_regexp.length())
        {
            return false;
        }
        //Work on a copy so that the object can be reused for the next word
        std::vector<int> remaining = m_buckets;
        const std::size_t wildcard_slot = remaining.size() - 1;
        for (const char chr : word)
        {
            const unsigned slot = index(chr);
            if ((slot < wildcard_slot) && (remaining[slot] > 0))
            {
                //Use up one occurrence of this char
                --remaining[slot];
            }
            else if (--remaining[wildcard_slot] < 0)
            {
                //No occurrence and no wildcard left: not a match
                return false;
            }
        }
        return true;
    }
    /**
    * This returns the length of the regex used to initialise this class object.
    *
    * @return {unsigned} - The length of the regex.
    */
    unsigned size(void) const
    {
        return static_cast<unsigned>(m_regexp.size());
    }
private:
    std::string m_regexp;       //pattern with wildcards mapped to m_wildcard
    std::vector<int> m_buckets; //per-char counts, wildcard count in last entry
    char m_wildcard;            //internal wildcard value, char(127)
    char m_begin;               //smallest non-wildcard char of the pattern
};
/**
* This class iterates over the lines in a dictionary file seeking those that
* match the regular expression provided.
*
* @tparam RegExp - The regular expression matching engine to be used.
* The default is ExtendedRegExp. Any user-defined class must be copyable and
* must provide an operator==(const std::string&) that reports whether a line
* matches.
*
*/
template<typename RegExp = ExtendedRegExp >
class Parser
{
public:
    /**
    * This defines the "no-match" string pattern. A dictionary line that is
    * identical to this pattern cannot be told apart from "no more matches".
    *
    * @return {const std::string&} - The "no-match" pattern.
    */
    static const std::string& no_match(void)
    {
        static const std::string no_match("!£$%^&&^%$££$%^&");
        return no_match;
    }
    /**
    * @param dictionary {const std::string&} - The dictionary file to be parsed
    * @param regexp {const RegExp&} - The regular expression to be used for
    * matching
    */
    Parser(const std::string& dictionary, const RegExp& regexp) :
        m_dictionary(dictionary),
        m_regexp(regexp)
    {
    }
    /**
    * This resets the dictionary file and regular expression.
    *
    * @param dictionary {const std::string&} - The dictionary file to be parsed
    * @param regexp {const RegExp&} - The regular expression to be used for
    * matching
    */
    void reset(const std::string& dictionary, const RegExp& regexp)
    {
        //Re-open in place: stream move-assignment is not available on every
        //standard library, and clear() drops eof/fail left by earlier reads
        m_dictionary.close();
        m_dictionary.clear();
        m_dictionary.open(dictionary);
        m_regexp = regexp;
    }
    /**
    * This returns the next line in the dictionary file that is successfully
    * matched against the regular expression used to initialise this class
    * object. Only lines that were actually read are tested, so reaching the
    * end of the file never produces a spurious empty match.
    *
    * @return {std::string} - The next line matched, or no_match() when the
    * file could not be read or holds no further matching line
    */
    std::string get_next_match(void)
    {
        std::string line;
        //Loop over strings in file until next match is found
        while (std::getline(m_dictionary, line))
        {
            if (m_regexp == line)
            {
                return line;
            }
        }
        return no_match();
    }
private:
    std::ifstream m_dictionary; //dictionary being scanned
    RegExp m_regexp;            //matching engine applied to every line
};
}
/**
* @}
*/
/**
* @addtogroup Tests
* @{
*/
namespace Tests
{
/**
* Wrapper class for std::vector converting {a, b, c, ...} to "a b c ..."
*/
struct PrintVector
{
    /**
    * @param vec {const std::vector<std::string>&} - The strings to be
    * printed; a copy is kept in m_vec
    */
    PrintVector(const std::vector<std::string>& vec) :
        m_vec(vec)
    {
    }
    /**
    * @return {std::string} - Every element followed by a single space,
    * in order (so the result of a non-empty vector ends with a space)
    */
    std::string str() const
    {
        std::string joined;
        for (const auto& elem : m_vec)
        {
            joined += elem;
            joined += ' ';
        }
        return joined;
    }
    std::vector<std::string> m_vec;
};
/**
*
* =========
*
* Test plan
*
* =========
*
* 1. Extract up to n random non-blank lines from the dictionary and store them
* in tmp_dictionary.txt (created or overwritten). n=500.
*
* 2. Randomly select one of the stored lines and use it as the pattern to be
* matched (regex).
*
* 3. Replace the first 0, 1, 2, ... all chars in regex with " ",
* to test wildcard substitution to give regex'.
*
* 4. For each regex' extract all matching lines in tmp_dictionary.txt with
* Parser/ExtendedRegExp and store in "results" vector.
*
* 5. For each regex' iterate over all permutations of the characters.
*
* 6. For each stored line, test every permutation of regex' (truncated to the
* length of the line) using std::regex_match and store the lines that match
* in "control" vector.
*
* 7. Validate that results == control.
*
* NB: It is unnecessary to consider all permutations of locations to
* place the wildcards, (e.g. "..aa", ".a.a"), since the test already considers
* all permutations of regex and thus covers this equivalence partition.
*
* NB: The control search visits up to length! permutations per line, so a long
* pattern makes the verification slow.
*
* @param dictionary_file {const std::string&} - Fully-qualified filename
* for dictionary
*
*/
void tests(const std::string& dictionary_file)
{
    typedef std::vector<std::string> sv;
    //Namespaces for unit test framework and parser.
    using namespace UnitTest;
    using namespace RegexpProblem;
    //Vectors for results of parser algorithm
    // "control" generated by exhaustive search
    sv results;
    sv control;
    const std::string tmp_dictionary_file("tmp_dictionary.txt");
    const unsigned tmp_dictionary_size = 500;
    //Count the newline-terminated lines of the dictionary. An unreadable or
    //empty file gives 0, for which no random line number can be drawn.
    std::ifstream dictionary(dictionary_file);
    const auto size = std::count(std::istreambuf_iterator<char>(dictionary),
        std::istreambuf_iterator<char>(), '\n');
    if (size < 1)
    {
        std::cout << "Tests skipped: no lines could be read from "
            << dictionary_file << std::endl;
        return;
    }
    dictionary.clear();
    dictionary.seekg(0);
    //Create set of random (1-based) line numbers to extract from dictionary.
    //The set removes duplicates and yields the numbers in ascending order,
    //which allows the dictionary to be read in a single forward pass.
    std::set<unsigned> random_lines;
    //std::random_device dev;
    std::mt19937 generator(0);
    std::uniform_int_distribution<> distr(1, static_cast<int>(size));
    for (unsigned i = 0; i < tmp_dictionary_size; ++i)
    {
        random_lines.insert(distr(generator));
    }
    //Create small dictionary with random entries from original to speed tests.
    //The same lines are kept in memory for the control search.
    sv words;
    {
        //std::ofstream creates/truncates the file; it is closed (and so
        //flushed) at the end of this scope, before any parser reads it.
        std::ofstream tmp_dictionary(tmp_dictionary_file);
        if (!tmp_dictionary)
        {
            std::cout << "Tests skipped: unable to create "
                << tmp_dictionary_file << std::endl;
            return;
        }
        std::string line;
        unsigned line_number = 0;
        for (const auto random_line : random_lines)
        {
            //Advance to the requested line, stopping if the file ends early
            while ((line_number < random_line) &&
                std::getline(dictionary, line))
            {
                ++line_number;
            }
            if (line_number != random_line)
            {
                break;
            }
            //Blank lines are not dictionary words: leave them out
            if (line.empty() == false)
            {
                tmp_dictionary << line << '\n';
                words.push_back(line);
            }
        }
    }
    if (words.empty())
    {
        std::cout << "Tests skipped: no words could be sampled from "
            << dictionary_file << std::endl;
        return;
    }
    //Lambda to perform exhaustive matching over all permutations of regex
    //to verify algorithm
    auto exhaustive_match = [&](const std::string& pattern)
    {
        std::cout << std::endl
            << "Verification requires exhaustive search, please wait";
        //Start from the sorted pattern so that std::next_permutation visits
        //every distinct permutation exactly once
        std::string sorted_pattern = pattern;
        std::sort(sorted_pattern.begin(), sorted_pattern.end());
        for (const auto& word : words)
        {
            std::cout << ".";
            //A word longer than the pattern cannot be spanned by it
            if (word.length() > pattern.length())
            {
                continue;
            }
            std::string permutation = sorted_pattern;
            bool is_found = false;
            //do/while so that the first (sorted) permutation is tested too
            do
            {
                is_found = std::regex_match(word,
                    std::regex(permutation.substr(0, word.length())));
            } while ((is_found == false) &&
                std::next_permutation(permutation.begin(), permutation.end()));
            //Add matches to list of "control" values used for verification
            if (is_found)
            {
                control.push_back(word);
            }
        }
        std::cout << std::endl;
    };
    //Create instances of parser with regexp = tmp_dictionary[random]
    const unsigned trials = 1;
    for (unsigned trial = 1; trial <= trials; ++trial)
    {
        //Get random line from the sampled dictionary to act as regexp
        std::uniform_int_distribution<> rand_distr(1,
            static_cast<int>(words.size()));
        std::string regexp = words[rand_distr(generator) - 1];
        //Loop over num of wildcards to use (0 to all chars).
        //NB All-wildcards should match all strings of length<=regexp.length
        const auto num_wildcards = regexp.size();
        for (std::string::size_type num = 0; num <= num_wildcards; ++num)
        {
            results.clear();
            control.clear();
            //Replace relevant char with wildcard
            if (num > 0)
            {
                regexp[num - 1] = ' ';
            }
            //Create parser and loop over matches found
            Parser<> parser(tmp_dictionary_file, ExtendedRegExp(regexp, ' '));
            std::string match;
            while ((match = parser.get_next_match()) != Parser<>::no_match())
            {
                //The temporary dictionary holds no blank lines, so an empty
                //match is not a word and is left out of the comparison
                if (match.empty() == false)
                {
                    results.push_back(match);
                }
            }
            //Perform exhaustive match search for verification
            //for regex - replace ' ' with '.' as wildcard
            auto tmp_regexp = regexp;
            std::replace(tmp_regexp.begin(), tmp_regexp.end(), ' ', '.');
            exhaustive_match(tmp_regexp);
            //Verify algorithm against exhaustive match search
            VERIFY(std::string("Regexp = ") + "\"" + tmp_regexp + "\"",
                PrintVector(results).str()) == PrintVector(control).str();
        }
    }
}
}
/**
* @}
*/
/**
* Prompts on stdin for the pattern, the dictionary filename and the results
* filename, optionally runs the self tests, then writes every dictionary line
* matched by the pattern to the results file, one per line.
*
* The pattern is read as a whole line so that it may contain blanks
* (wildcards); the two filenames are read as whitespace-delimited tokens.
*
* @return {int} - 0 on success, 1 if the results file cannot be opened
*/
int main(void)
{
    using namespace Tests;
    using namespace UnitTest;
    using namespace RegexpProblem;
    /*
    //This struct pauses at the end of tests to print out results
    struct BreakPointAfterMainExits
    {
    BreakPointAfterMainExits(void)
    {
    static BreakPointAfterMainExits tmp;
    }
    ~BreakPointAfterMainExits(void)
    {
    unsigned set_bp_here_for_test_results = 0;
    }
    } dummy;
    */
    std::string pattern;
    std::cout << "Please specify the pattern to be matched" << std::endl;
    std::getline(std::cin, pattern, '\n');
    std::cout << pattern << std::endl;
    std::string dictionary;
    std::cout << "Please specify the fully-qualified dictionary filename"
        << std::endl;
    std::cin >> dictionary;
    std::cout << dictionary << std::endl;
    std::string results_file;
    std::cout << "Please specify the fully-qualified filename for the results"
        << std::endl;
    std::cin >> results_file;
    std::cout << results_file << std::endl;
    //Run tests
    char yes_no = 'n'; //stays 'n' if the answer cannot be read
    std::cout << "Do you wish to run the tests (they run slowly on Windows "
        << "and quickly under Linux)? - (y/n):";
    std::cin >> yes_no;
    if (yes_no == 'y' || yes_no == 'Y')
    {
        tests(dictionary);
        Verify<Results> results;
    }
    //Match the dictionary against the pattern, send results to results file
    std::cout << "matching strings in dictionary.txt against \" "
        << pattern
        << " \" "
        << " Results will be sent to "
        << results_file
        << ". Please wait ..."
        << std::endl;
    std::ofstream output(results_file);
    if (!output)
    {
        std::cerr << "Unable to open results file: " << results_file
            << std::endl;
        return 1;
    }
    Parser<> parser(dictionary, ExtendedRegExp(pattern, ' '));
    std::string match;
    while ((match = parser.get_next_match()) != Parser<>::no_match())
    {
        //'\n' rather than std::endl: no need to flush after every word
        output << match << '\n';
    }
    return 0;
}