#include <stdio.h>
#include <stdlib.h>

/*
 * This program reads a file containing a "corpus" of text and computes the
 * monographic and digraphic probabilities derived from the counts of letters
 * and pairs of letters in the corpus file.  The output is in the format of
 * arrays of probabilities suitable for use in other C programs, and is
 * written to the file "probs.h" in the current directory.  The first
 * array, monoprobs[0..25] contains the probability of seeing the corresponding
 * letter in the text file (0=A, 1=B, 2=C, ..., 25=Z).  The second array,
 * diprobs[0..25][0..25] contains the table of probabilities of seeing the jth
 * character immediately following the ith character, that is
 *
 *   diprobs[i][j] = Probability ((n+1)-st character is j, given that the nth
 *                                character is i)
 *
 * Upper and lower case letters are counted together.  A probability whose
 * denominator is zero (a letter that never occurs, or never starts a pair)
 * is written as 0 rather than as a NaN, so probs.h is always valid C.
 *
 * To run this program, run "probs filename", where probs.exe is the compiled
 * name of this program and filename is the name of a file containing a corpus
 * of text in whatever language you want (I assume one based on the Roman
 * alphabet, like English).  The corpus file should have, only as a rule of
 * thumb, 1-2 million characters.
 *
 * This program was originally compiled with Borland Turbo C++ 4.5.
 *
 * Copyright 27 September 2000 by Lee A. Taylor
 */

/*
 * Letter classification macros.  They play the role of isalpha(), islower()
 * and isupper() from <ctype.h>, but only ever accept the unaccented letters
 * a-z / A-Z.  Each macro evaluates its argument more than once, so do not
 * pass an expression with side effects.
 */
#define ISALPHA(c) (ISLOWER(c) || ISUPPER(c))
#define ISLOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define ISUPPER(c) (((c) >= 'A') && ((c) <= 'Z'))

/*
 * The maximum number of characters read from the corpus file in one chunk.
 * The value only affects I/O granularity: letter pairs that straddle two
 * chunks are still counted.
 */
#define BUFSIZE 1024

/* Number of letters in the alphabet being counted. */
#define NLETTERS 26

/*
 * See above comments for a description of these arrays of probabilities.
 * While the corpus is being read they hold raw counts; the division by the
 * matching total happens when the values are written out.
 */
long double monoprobs[NLETTERS];
long double diprobs[NLETTERS][NLETTERS];

/*
 * Map a letter to its alphabet position (0 for 'a'/'A' ... 25 for 'z'/'Z').
 * Returns -1 for anything that is not a letter.  Like the rest of the
 * program this assumes the letters are contiguous in the character set.
 */
static int letter_index(int c)
{
    if (ISLOWER(c))
        return c - 'a';
    if (ISUPPER(c))
        return c - 'A';
    return -1;
}

/*
 * Write count/total to out using the "%10.8Lf" layout, followed by ", " or,
 * for the final element of an array, by " };" and a newline.  A zero total
 * yields 0 instead of dividing by zero.
 */
static void put_prob(FILE *out, long double count, long total, int is_last)
{
    long double p = (total > 0) ? count / (long double)total : 0.0L;

    if (is_last)
        fprintf(out, "%10.8Lf };\n", p);
    else
        fprintf(out, "%10.8Lf, ", p);
}

/*
 * Usage: probs plaintextfile
 *
 * Counts letters and adjacent letter pairs in the file named by argv[1] and
 * writes the resulting probability tables to "probs.h".  Returns
 * EXIT_SUCCESS on success and EXIT_FAILURE on a usage error or any I/O
 * failure.
 */
int main(int argc, char *argv[])
{
    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "probs";
    char buf[BUFSIZE];              /* Chunk of input from the corpus. */
    long dicount[NLETTERS] = {0};   /* dicount[i] = # of counted pairs whose
                                       first letter is i. */
    long monocount = 0;             /* # of letters seen in the corpus. */
    int prev = -1;                  /* Index of the previous character if it
                                       was a letter, otherwise -1. */
    size_t nbytes;
    size_t k;
    int i, j;
    int failed;
    FILE *in;
    FILE *out;

    /*
     * Spit out a message if the user doesn't specify a valid file name on the
     * command line.
     */
    if (argc != 2) {
        fprintf(stderr, "%s: Usage is %s plaintextfile\n", prog, prog);
        return EXIT_FAILURE;
    }

    /* Binary mode: the bytes are only classified, never interpreted as text. */
    in = fopen(argv[1], "rb");
    if (in == NULL) {
        fprintf(stderr, "%s: can't open file %s\n", prog, argv[1]);
        return EXIT_FAILURE;
    }

    out = fopen("probs.h", "w");
    if (out == NULL) {
        fprintf(stderr, "%s: can't create file probs.h\n", prog);
        fclose(in);
        return EXIT_FAILURE;
    }

    /*
     * monoprobs and diprobs have static storage duration, so they start out
     * as all zeros and need no explicit clearing here.
     *
     * Single pass over the corpus.  Every letter bumps its monographic count.
     * A digraph is counted whenever a letter directly follows another letter;
     * pairs are NOT counted across word boundaries (e.g., the ES from the end
     * of THE and the beginning of SOUTH in "THE SOUTH" is not counted),
     * because any non-letter resets prev to -1.  prev survives from one chunk
     * to the next, so pairs split by a chunk boundary are counted too.
     */
    while ((nbytes = fread(buf, 1, sizeof buf, in)) > 0) {
        for (k = 0; k < nbytes; k++) {
            int cur = letter_index(buf[k]);

            if (cur >= 0) {
                monocount++;
                monoprobs[cur] += 1.0L;
                if (prev >= 0) {
                    dicount[prev]++;
                    diprobs[prev][cur] += 1.0L;
                }
            }
            prev = cur;
        }
    }

    failed = ferror(in);
    fclose(in);
    if (failed) {
        fprintf(stderr, "%s: error reading file %s\n", prog, argv[1]);
        fclose(out);
        return EXIT_FAILURE;
    }

    /*
     * Now go through and convert the counts to probabilities and write the
     * formatted C arrays out to probs.h.
     */
    fprintf(out, "double monoprobs[26] = {");
    for (i = 0; i < NLETTERS; i++)
        put_prob(out, monoprobs[i], monocount, i == NLETTERS - 1);

    /* One text line per row; the closing brace follows the very last value. */
    fprintf(out, "double diprobs[26][26] = {\n");
    for (i = 0; i < NLETTERS; i++) {
        for (j = 0; j < NLETTERS; j++)
            put_prob(out, diprobs[i][j], dicount[i],
                     i == NLETTERS - 1 && j == NLETTERS - 1);
        if (i < NLETTERS - 1)
            fprintf(out, "\n");
    }

    /* fclose flushes, so its result matters for a file we wrote. */
    failed = ferror(out);
    if (fclose(out) != 0)
        failed = 1;
    if (failed) {
        fprintf(stderr, "%s: error writing file probs.h\n", prog);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}