#include #include #include #include #include // configuration #define DEBUG 0 #define PROCESSFILE 1 #define DICTIONARY_FILE "/var/tmp/twl06.txt" #define PROCESSED_FILE "processedfile" #define NUM_WORDS 178691 #define MAX_LEN 17 #define MAX_INPUT_LEN 100 #define abs(a) ((a) < 0 ? -(a) : (a)) #define isspace(c) (c == ' ' || c == '\t' || c == '\r' || c == '\n') char* wordlist[NUM_WORDS]; int wordlen[NUM_WORDS]; int startAt[NUM_WORDS]; char buffer[MAX_INPUT_LEN + 1]; #if DEBUG #define ASSERT(a, ...) do {if(!(a)){fprintf(stderr, "Error: Assert for '%s' returned false on line %d of %s\n", #a, __LINE__, __FILE__);exit(1);}} while(0) #define start() (startTime = prevTime = currentTimeMillis()) #define display() displayTime() unsigned long startTime; unsigned long prevTime; unsigned long currentTimeMillis() { struct timeval tv; gettimeofday(&tv, 0); return tv.tv_usec + tv.tv_sec * 1000000; } unsigned long displayTime() { unsigned long temp = currentTimeMillis(); fprintf(stderr, "%ld microseconds since start/%ld microseconds since previous\n", temp - startTime, temp - prevTime); prevTime = temp; return prevTime; } #else #define ASSERT(a, ...) 0 #define start() 0 #define display() 0 #endif int d[MAX_INPUT_LEN + 1][MAX_LEN + 1]; int i; int numSharedPrefix(const char* word1, const char* word2) { int i; for(i = 0; *word1 && *word1++ == *word2++; i++); return i; } #if PROCESSFILE void str_tolower(char *s) { int diff = 'a' - 'A'; while(*s) { if(*s >= 'A' && *s <= 'Z') { *s += diff; } s++; } } inline int myStrlen(char* str) { int index = 0; while(str[index] != '\n' && str[index] != '\r') { index++; } return index; } void produceWordList(char filename[], char outputfilename[]) { FILE* fp = fopen(filename, "r"); FILE* myout = fopen(outputfilename, "wb+"); char line[MAX_LEN + 2]; if(fp == NULL) { fprintf(stderr, "File (%s) can't be opened\n", filename); exit(1); } if(myout == NULL) { fprintf(stderr, "File (%s) can't be opened for writing\n", "processedinput"); exit(1); } int start = 0; for(int word = 0; fgets(line, MAX_LEN + 2, fp) != NULL; word++) { wordlen[word] = myStrlen(line); str_tolower(line); line[wordlen[word]] = 0; wordlist[word] = strcpy((char*)malloc(sizeof(char) * (wordlen[word] + 1)), line); fwrite(&wordlen[word], sizeof(int), 1, myout); // calculate startAt[word] if(word) { start = numSharedPrefix(wordlist[word], wordlist[word - 1]); } fwrite(&start, sizeof(int), 1, myout); fwrite(wordlist[word], sizeof(char), wordlen[word], myout); } fclose(fp); fclose(myout); } #endif void populateWordList(char* filename) { FILE* fp = fopen(filename, "rb"); if(fp == NULL) { fprintf(stderr, "Error opening %s...\n", filename); exit(1); } startAt[0] = 1; for(int word = 0; word < NUM_WORDS; word++) { fread(&wordlen[word], sizeof(int), 1, fp); fread(&startAt[word], sizeof(int), 1, fp); wordlist[word] = (char*)malloc(sizeof(char) * (wordlen[word] + 1)); fread(wordlist[word], sizeof(char), wordlen[word], fp); #if DEBUG == 2 printf("Word: '%s' (len = %d)\n", wordlist[word], wordlen[word]); #endif } fclose(fp); } /* int editDistance(char* str, int inputWordLen, int i) { int row, col, min, temp; int dictionaryWordLen = wordlen[i]; char* word = wordlist[i]; for(row = startAt[i]; row <= dictionaryWordLen; row++) { for(col = 1; col <= inputWordLen; col++) { min = (d[row - 1][col] < d[row][col - 1] ? d[row - 1][col] : d[row][col - 1]) + 1; temp = d[row - 1][col - 1] + (str[col - 1] != word[row - 1]); d[row][col] = min < temp ? min : temp; } } return d[dictionaryWordLen][inputWordLen]; }*/ int getMinEditDistance(char* str, int inputWordLen) { int min = inputWordLen; int row, col, minOf3, temp; int dictionaryWordLen; int prevID = -1; char* word; for(i = 0; i < NUM_WORDS; i++) { // compute edit distance dictionaryWordLen = wordlen[i]; if(abs(dictionaryWordLen - inputWordLen) < min) { word = wordlist[i]; for(row = (prevID == i - 1) ? startAt[i] : numSharedPrefix(wordlist[prevID], word); row < dictionaryWordLen; row++) { for(col = 0; col < inputWordLen; col++) { minOf3 = (d[row][col + 1] < d[row + 1][col] ? d[row][col + 1] : d[row + 1][col]) + 1; temp = d[row][col] + (str[col] != word[row]); d[row + 1][col + 1] = minOf3 < temp ? minOf3 : temp; } } temp = d[dictionaryWordLen][inputWordLen]; if(temp < min) { min = temp; } prevID = i; } } return min; } int getWord(FILE* fp) { int tmp; do { tmp = fgetc(fp); } while(isspace(tmp) && tmp != EOF); // if that first character is not EOF or null if(tmp && tmp != EOF) { // tmp is the first character that is not a space (or newline) buffer[0] = tmp; for(i = 1; i < MAX_INPUT_LEN; i++) { tmp = fgetc(fp); if(isspace(tmp) || tmp == '\0' || tmp == EOF) { break; } else { buffer[i] = tmp; } } buffer[i] = 0; #if DEBUG if(tmp && !isspace(tmp) && tmp != EOF) { fprintf(stderr, "Error. '%s' is too long", buffer); } #endif return i; } else { return 0; } } int main(int argc, char* argv[]) { #if PROCESSFILE if(argc == 1) { produceWordList(DICTIONARY_FILE, PROCESSED_FILE); exit(0); } #endif FILE* fp = fopen(argv[1], "r"); if(fp == NULL) { fprintf(stderr, "File (%s) cannot be opened\n", argv[1]); exit(1); } start(); populateWordList(PROCESSED_FILE); display(); for(i = 0; i <= MAX_INPUT_LEN; i++) { d[i][0] = i; } for(i = 0; i <= MAX_LEN; i++) { d[0][i] = i; } display(); int sum = 0; int inputWordLen; while(inputWordLen = getWord(fp)) { sum += getMinEditDistance(buffer, inputWordLen); } display(); printf("%d\n", sum); return 0; }