Here is the C program that we use to create our lettersets:
#include <assert.h>
#include <stdio.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fstrcmp.h>
/* Number of extension attempts newSet() counts before it restarts a set
   from a fresh fragment of the word list. */
#define MAXSETSEARCH 1000000
/* Capacity of the word-index arrays used by testSet(). */
#define MAXWORDS 64000
/* Number of letter slots: 26 for 'a'..'z' plus the five special slots below. */
#define charnum 31
/* Slot indices (into alphabet[] and the require[] tables) of the
   characters outside 'a'..'z'. */
#define aumlaut 26
#define oumlaut 27
#define uumlaut 28
#define szlig 29
#define apostroph 30
/* Number of letters in each generated set (option -s). */
int setsize=12;
/* Length of the word-list fragment newSet() starts a set with; newSet()
   also compares homsize-1 characters when extending a set. */
int homsize=2; // <- does not work correctly for larger values
/* Verbosity (option -l): >1 prints one CSV line per accepted set,
   >2 adds the list of matching words, >4 adds progress messages. */
int loglevel=2;
/* main() stops once setnum reaches this value; 0 means no limit (option -M). */
int maxset=0;
/* Number of sets accepted by testSet() so far. */
int setnum=0;
/* Name of the word-list file read by readWords() (option -i). */
char *infilename="isowordlist";
/* Seed for the random number generator (option -r). */
int seed=123;
/* The complete word-list file in memory; readWords() replaces every line
   break by a NUL so that each word is a C string. */
char *allWords=NULL;
/* After readWords(): index of the last entry in words[], i.e. the number
   of words minus one. */
long int wordNum=0;
/* Size of the word list in bytes; readWords() reduces it to the offset of
   the NUL that ends the last word. */
long int wlSize=0;
/* Maps a letter slot (0..charnum-1) to its character; filled by initAlphabet(). */
char alphabet[charnum];
/* Target score of a set (option -a). */
int approxPoints=5450;
/* When >0, testSet() skips sets whose score differs from approxPoints by
   more than this amount (option -A). */
int approxPointsRange=0;
/* Option -T: letters copied into each new set before it is extended. */
char *testString=NULL;
/* Option -I: word for which testSet() prints diagnostic output. */
char *oneWord=NULL;
/* One entry of the word list. */
typedef struct {
/* Start of the NUL-terminated word inside allWords. */
char * word;
/* Score of the word; readWords() sets it to n*(n+1)/2 for a word of n characters. */
int points;
/* How often each letter slot occurs in the word. */
short int require[charnum];
} wordtype ;
/* Table of all words; allocated and filled by readWords(). */
wordtype *words=NULL;
/* A candidate letter set. */
typedef struct {
/* The letters of the set as a NUL-terminated string (allocated by newSet()). */
char *string ;
/* How often each letter slot occurs in string. */
short int require[charnum];
} settype ;
/* The set main() is currently generating and testing. */
settype* set;
/*
 * Count one character of a word in a letter-requirement table.
 *
 * c       - points at the character to count (one ISO-8859-1 byte)
 * require - table of charnum counters; the counter for *c is incremented
 * word    - the word *c belongs to, only used in the error message
 *
 * 'a'..'z' map to slots 0..25; a-umlaut, o-umlaut, u-umlaut, sharp s and
 * the apostrophe map to aumlaut, oumlaut, uumlaut, szlig and apostroph.
 * Any other character is reported on stdout and ends the program with
 * exit status 1.
 */
void addChar(char* c,short int *require,char *word) {
    int j;

    switch(*c) {
    /* Hex escapes (ISO-8859-1 codes) keep these constants single bytes no
       matter which encoding this source file is stored in. */
    case '\xE4': require[aumlaut]++; break;     /* a-umlaut */
    case '\xF6': require[oumlaut]++; break;     /* o-umlaut */
    case '\xFC': require[uumlaut]++; break;     /* u-umlaut */
    case '\xDF': require[szlig]++; break;       /* sharp s */
    case '\'':   require[apostroph]++; break;
    default:
        j = *c-'a';
        /* Only 'a'..'z' are valid here.  Testing against charnum instead
           would let the bytes after 'z' be counted in the umlaut slots and
           would allow a write one element past the end of require[]. */
        if (j<0 || j>'z'-'a') {
            printf("*** unknown character '%c' in %s ***\n",*c,word);
            exit(1);
        }
        require[j]++;
        break;
    }
}
/*
 * Fill alphabet[] so that every letter slot maps back to its character:
 * slots 0..25 hold 'a'..'z', the remaining five hold a-umlaut, o-umlaut,
 * u-umlaut, sharp s and the apostrophe.
 */
void initAlphabet () {
    char c;

    for (c='a';c<='z';++c) alphabet[c-'a']=c;
    /* ISO-8859-1 codes written as hex escapes: a literal umlaut would turn
       into a multi-character constant (and be truncated on assignment)
       if this source file were stored as UTF-8. */
    alphabet[aumlaut]='\xE4';       /* a-umlaut */
    alphabet[oumlaut]='\xF6';       /* o-umlaut */
    alphabet[uumlaut]='\xFC';       /* u-umlaut */
    alphabet[szlig]='\xDF';         /* sharp s */
    alphabet[apostroph]='\'';
}
/*
 * Print one word-list entry on a single line of stdout: its score, the
 * word itself and, for every letter slot, the slot's character followed
 * by the number of times the word needs it.
 */
void showWord(wordtype w) {
    int slot = 0;

    printf("%4d %10s ", w.points, w.word);
    while (slot < charnum) {
        printf("%c%d ", alphabet[slot], w.require[slot]);
        slot++;
    }
    printf("\n");
}
/*
 * Print one letter set on a single line of stdout: its letter string and,
 * for every letter slot, the slot's character followed by the number of
 * times the set provides it.
 */
void showSet(settype w) {
    int slot = 0;

    printf("%15s ", w.string);
    while (slot < charnum) {
        printf("%c%d ", alphabet[slot], w.require[slot]);
        slot++;
    }
    printf("\n");
}
/*
 * Turn a letter-count table back into a string.
 *
 * require - table of charnum counters
 * string  - receives, in slot order, alphabet[i] repeated require[i]
 *           times, followed by a NUL; the caller must provide room for
 *           the sum of all counters plus one byte
 *
 * The result is always NUL-terminated, also when every counter is zero
 * (string then becomes the empty string).
 */
void require2char (short int *require,char *string) {
    int i,j;
    int l=0;

    for (i=0;i<charnum;++i)
        for (j=0;j<require[i];++j)
            string[l++]=alphabet[i];
    string[l]=0;
}
/*
 * Helper for newSet(): overwrite set->string with a random fragment of
 * homsize characters taken from the word list.  Fragments that contain a
 * word boundary (a NUL) are rejected and drawn again.
 */
static void startSet(settype *set) {
    long int pt;

    *set->string = 0;
    for (;;) {
        pt = (long int) ( floor(((wlSize-homsize)*1.0) * (rand() * 1.0) / RAND_MAX ));
        strncpy(set->string,allWords+pt,homsize);
        set->string[homsize]=0;         /* strncpy does not terminate */
        if (strlen(set->string)==(size_t)homsize) break;
    }
}

/*
 * Build a new random letter set of setsize letters.
 *
 * The set starts with a random fragment of the word list (or with
 * testString when that is given) and is extended one letter at a time:
 * a random position of the word list is drawn and, when the homsize-1
 * characters found there equal the last homsize-1 letters of the set, a
 * character from the word list is appended.  Finally the letters are
 * counted into require[] and string is rewritten in letter-slot order.
 *
 * Returns a heap-allocated set that the caller releases with freeSet().
 * Terminates the program with exit status 1 when memory runs out or when
 * testString does not fit into a set.
 */
settype *newSet(void) {
    int i,l;
    settype *set;
    long int pt;
    char *c;
    int count=0;

    set = malloc(sizeof *set);
    if (set) set->string = malloc( (setsize+1) * sizeof(char));
    if (!set || !set->string) {
        printf("*** out of memory ***\n");
        exit(1);
    }
    startSet(set);
    if (testString) {
        size_t tlen = strlen(testString);
        /* A longer string would overflow set->string; a shorter one would
           make the comparison below start before the buffer. */
        if (tlen > (size_t)setsize || tlen+1 < (size_t)homsize) {
            printf("*** test set \"%s\" must have %d to %d letters ***\n",
                   testString,homsize-1,setsize);
            exit(1);
        }
        strcpy(set->string,testString);
    }
    l = strlen(set->string);
    while (l<setsize) {
        if (++count > MAXSETSEARCH) {
            /* Dead end: start over.  The counter has to be reset as well,
               otherwise every following iteration would restart the set
               again and it could never be completed. */
            startSet(set);
            l = strlen(set->string);
            count = 0;
        }
        pt = (long int) ( floor(((wlSize-1)*1.0) * (rand() * 1.0) / RAND_MAX ));
        if (! strncmp( set->string + l - homsize + 1, allWords+pt, homsize - 1)) {
            /* NOTE(review): the appended character is allWords[pt+homsize],
               one position beyond the one directly following the compared
               fragment; kept unchanged - confirm that this is intended. */
            set->string[l] = *(pt+allWords+homsize);
            set->string[l+1] = 0;
            /* Appending a NUL (word boundary) leaves the length unchanged,
               so such a draw is simply ignored. */
            l = strlen(set->string);
        }
    }
    for (i=0;i<charnum;++i) set->require[i]=0;
    for (c=set->string;*c != 0;++c) {
        addChar(c,set->require,set->string);
    }
    require2char ( set->require, set->string );
    return(set);
}
/*
 * Release a set created by newSet(): first its letter string, then the
 * set structure itself.
 */
void freeSet(settype* set) {
    char *letters = set->string;

    free(letters);
    free(set);
}
/*
 * Check whether a table of required letters is covered by a table of
 * provided letters.
 *
 * req      - number of times each letter slot is needed
 * provides - number of times each letter slot is available
 *
 * Returns 0 when every slot is covered, otherwise the index of the first
 * uncovered slot plus one.
 *
 * Declared static inline: a plain "inline" definition provides no external
 * definition in C99/C11, so the program fails to link whenever the
 * compiler decides not to inline a call.
 */
static inline int feasible(const short int *req,const short int *provides) {
    int i;

    for (i=0;i<charnum;++i)
        if (req[i]>provides[i]) return(i+1);
    return(0);
}
void testSet(settype* set) {
int points=0;
double overlap;
long int i;
static long int wordlist[MAXWORDS];
double thisAlike,bestAlike,sumAlike,avAlike;
static long int wordlist2[MAXWORDS];
int lastword=0;
int lastword2=0;
int maxlen=0;
void compareWordlists(long int *w,long int *w2,int l,int l2) {
int i,j,c,thislen;
c=0;
maxlen=0;
sumAlike=0;
for (i=0;i<l;++i) {
bestAlike=0;
thislen = strlen(words[w[i]].word);
if (thislen>maxlen) maxlen=thislen;
for (j=0;j<l2;++j)
if (w[i]==w2[j]) ++c;
for (j=0;j<l;++j) {
thisAlike=fstrcmp(words[w[i]].word,words[w[j]].word);
if (i != j && thisAlike > bestAlike) bestAlike=thisAlike;
}
sumAlike += bestAlike;
}
overlap=(1.0*c)/l;
avAlike=sumAlike/l;
}
for (i=0;i<=wordNum;++i) {
if (oneWord && !strcmp(oneWord,words[i].word)) {
showWord(words[i]);
showSet(*set);
printf ("feasibility = %d\n",(feasible(words[i].require,(*set).require)));
}
if (! feasible(words[i].require,(*set).require)) {
if (lastword<MAXWORDS-1) wordlist[lastword++]=i;
points += words[i].points;
}
}
if (approxPointsRange>0 && abs(approxPoints-points)>approxPointsRange)
return;
++setnum;
compareWordlists(wordlist,wordlist2,lastword,lastword2);
if (loglevel>1) printf("%s,%d,%d,%g,%g,%d",(*set).string,points,lastword,overlap,avAlike,maxlen);
if (loglevel>2) {
printf(",\"");
for (i=0;i<lastword;++i) {
if(strlen(words[wordlist[i]].word)==maxlen) printf("*");
printf ("%s ",words[wordlist[i]].word);
wordlist2[i]=wordlist[i]; // copy wordlist do determine overlap
}
lastword2=lastword; // copy wordlist do determine overlap
printf("\"");
}
if (loglevel>1) printf("\n");
}
//--------------------------------------------------------------------------------
void readWords () {
struct stat ifstat;
char* c;
long int i;
int wlen=0;
FILE *ifptr;
void clearWord(int i,char *c) {
int j;
words[i].word=c;
words[i].points=0;
for (j=0;j<charnum;++j) words[i].require[j]=0;
}
wordNum=0;
if (stat(infilename, &ifstat)) {
printf("*** could not read file \"%s\" ***\n",infilename);
exit(1);
}
wlSize = ifstat.st_size;
if (loglevel>4) printf("size of %s is %ld\n",infilename,wlSize);
allWords = (char*) calloc(wlSize+2,sizeof(char));
assert(allWords);
if (loglevel>4) printf("reading file %s",infilename);
assert(ifptr = fopen(infilename,"r"));
assert(1==fread(allWords,wlSize, 1, ifptr));
fclose(ifptr);
if (loglevel>4) printf("...done\n");
/* first count of all words, convert EOL, convert to lower */
for (c=allWords;c<=allWords+wlSize;++c) {
switch(*c) {
case '\n': *c=0;++wordNum;break;
case 'Ä':*c='ä';break;
case 'Å':*c='a';break;
case 'Ö':*c='ö';break;
case 'Ü':*c='ü';break;
case 'á':*c='a';break;
case 'â':*c='a';break;
case 'å':*c='a';break;
case 'ç':*c='c';break;
case 'é':*c='e';break;
case 'è':*c='e';break;
case 'ê':*c='e';break;
case 'í':*c='i';break;
case 'ñ':*c='n';break;
case 'ó':*c='o';break;
case 'ô':*c='o';break;
case 'û':*c='u';break;
default : *c = tolower(*c);break;
}
}
/* drop unnecessary line breaks at the end */
while (*(allWords+wlSize)==0) {--wlSize;--wordNum;}
++wlSize;
++wordNum;
assert(words = malloc((wordNum+2) * sizeof(wordtype)));
/* get points to all words, determine points */
wlen = 0;
i=0;
clearWord(0,allWords);
for (c=allWords;c<=allWords+wlSize;++c) {
++wlen;
if (*c==0) {
words[i].points=(wlen * (wlen-1))/2;
clearWord(++i,c+1);
wlen=0;
continue;
}
addChar(c,words[i].require,words[i].word);
}
wordNum = i-1;
if (loglevel>4) {
printf("we have %ld words\n",
wordNum+1);
if (wordNum>5) {
for (i=0;i<5;++i) {
showWord(words[i]);
}
printf ("...\n");
for (i=wordNum-5;i<=wordNum;++i) {
showWord(words[i]);
}
}
}
}
int main (argc, argv)
int argc;
char **argv;
{
char c;
while (1) {
c = getopt (argc, argv, "a:A:i:I:l:r:s:T:M:");
if (c == -1)
break;
switch (c)
{
case 'a':
sscanf(optarg,"%d",&approxPoints);
break;
case 'A':
sscanf(optarg,"%d",&approxPointsRange);
break;
case 'i':
infilename = optarg;
break;
case 'I':
oneWord = optarg;
break;
case 'l':
sscanf(optarg,"%d",&loglevel);
break;
case 'M':
sscanf(optarg,"%d",&maxset);
break;
case 's':
sscanf(optarg,"%d",&setsize);
break;
case 'r':
sscanf(optarg,"%d",&seed);
break;
case 'T':
testString = optarg;
maxset = 1;
break;
default:
printf ("letters [options]\n");
printf ("-a approx. points [%d]\n",approxPoints);
printf ("-A approx. points range [%d]\n",approxPointsRange);
printf ("-i `name of letterfile [%s]'\n",infilename);
printf ("-I one Word (and no file) [%s]\n",oneWord);
printf ("-l loglevel [%d]\n",loglevel);
printf ("-M loglevel [%d]\n",maxset);
printf ("-r seed [%d]\n",seed);
printf ("-s size of letterset [%d]\n",setsize);
printf ("-T test set [%s]\n",testString);
exit (1);
}
}
srandom(seed);
initAlphabet();
if (loglevel>4) printf("size of each letterset=%d\n",setsize);
readWords();
for (;;) {
set = newSet();
testSet (set);
freeSet(set);
if (maxset>0 && setnum>=maxset) break;
}
free(allWords);
return 0;
}