|
|
Home » U++ Library support » U++ Libraries and TheIDE: i18n, Unicode and Internationalization » Adding new .scd spelling dictionary
Adding new .scd spelling dictionary [message #23536] |
Thu, 29 October 2009 11:28 |
|
koldo
Messages: 3360 Registered: August 2008
|
Senior Veteran |
|
|
Hello all
I would like to add a new .scd spelling dictionary and begin to use it to detect spelling errors when writing .tpp help files.
How can I create a new .scd file and add new words to it ?
Best regards
Koldo
Best regards
Iñaki
|
|
|
Re: Adding new .scd spelling dictionary [message #23553 is a reply to message #23536] |
Fri, 30 October 2009 16:48 |
|
mirek
Messages: 13975 Registered: November 2005
|
Ultimate Member |
|
|
koldo wrote on Thu, 29 October 2009 06:28 | Hello all
I would like to add a new .scd spelling dictionary and begin to use it to detect spelling errors when writing .tpp help files.
How can I create a new .scd file and add new words to it ?
Best regards
Koldo
|
Ha, good idea.
The problem is that the existing .scd files were created way back in 2002, so the code may be outdated today.
And... it took me more than an hour to find it. But I am glad we are about to refresh this one - and potentially add more .scd files.
Anyway, here we go:
#include <Speller/Speller.h>
// Not referenced anywhere in the code visible in this listing.
byte charset;
// vocn[k] holds the "value" score recorded by Make() when it appended the
// substring that became voc entry k (entries for single alphabet characters
// are never assigned and keep their zero initial value).
int vocn[256];
// Vocabulary built by Make(): first every single character seen in the word
// list, then the substrings selected by score.
Vector<String> voc;
// Encoded output per word group; the key is the LineCode() of the group.
VectorMap<int, String> line;
// Builds the grouping key for a word from its first three characters.
// Each character is lower-cased with ToLower(..., CHARSET_DEFAULT); the first
// one contributes unshifted, the second shifted left by 8 and the third
// shifted left by 16. s[0]..s[2] are read unconditionally.
int LineCode(const String& s)
{
	const auto first  = ToLower(s[0], CHARSET_DEFAULT);
	const auto second = ToLower(s[1], CHARSET_DEFAULT);
	const auto third  = ToLower(s[2], CHARSET_DEFAULT);
	return first + (second << 8) + (third << 16);
}
bool Contains(const String& a, const String& b)
{
for(int i = 0; i + b.GetLength() <= a.GetLength(); i++)
if(memcmp(~a + i, ~b, b.GetLength()) == 0) return true;
return false;
}
// Ordering predicate that places longer strings before shorter ones.
struct LengthOrder
{
	bool operator()(const String& a, const String& b) const
	{
		const int lenA = a.GetLength();
		const int lenB = b.GetLength();
		return lenB < lenA;
	}
};
struct NoCaseOrder
{
bool operator()(const String& a, const String& b) const
{
String la = ToLower(a);
String lb = ToLower(b);
return la != lb ? la < lb : a > b;
}
};
// Compresses a plain word list into the dictionary file.
//
// Input:  "f:/dict/cs_cz.txt", read line by line (one word per line); lines of
//         length 0 or 1 are ignored, two-character words are padded with
//         byte 127 so that every stored word has at least three characters.
// Output: "F:/dict/x.spell" containing
//           - the default charset byte, a 0 byte and the `dict` byte,
//           - every vocabulary entry followed by a 0 byte,
//           - for each word group: key (PutL), data length (PutL), data.
//
// Words are sorted with NoCaseOrder and grouped by LineCode(). Inside a group
// each word is stored as the length of the prefix it shares with the previous
// word (omitted for the first word of the group) followed by vocabulary
// indices offset by `dict`. `dict` is one more than the longest shared prefix
// seen, so prefix bytes and vocabulary bytes never overlap.
void Make()
{
	FileIn in("f:/dict/cs_cz.txt");
	SetDefaultCharset(CHARSET_WIN1250);
	Vector<String> w;
	Index<int> alphabet;
	String maxl;
	int maxlen = 0;
	while(!in.IsEof()) {
		String l = in.GetLine();
		if(l.GetLength() > maxlen) {
			maxlen = l.GetLength();
			maxl = l;
		}
		if(l.GetLength() > 1) {
			// LineCode() reads three characters, so pad two-letter words.
			if(l.GetLength() == 2)
				l.Cat(127);
			w.Add(l);
			for(const char *s = l; s < l.End(); s++)
				alphabet.FindAdd((byte)*s);
		}
	}
	printf("Words loaded, now sorting\n");
	ASSERT(maxlen < 64);
	LOG("Maximal length:" << maxlen << " " << maxl);
	Sort(w, NoCaseOrder());
	printf("Sorted, now gathering voc candidates\n");
	// ------------------
	// Pass 1: count candidate substrings and find the longest shared prefix.
	VectorMap<String, int> part;
	int dict = 0;
	int i = 0;
	while(i < w.GetCount()) {
		int linecode = LineCode(w[i]);
		String prevw;
		printf("line %s\n", ~ToLower(w[i].Mid(0, 3)));
		while(i < w.GetCount() && LineCode(w[i]) == linecode) {
			String ww = w[i];
			// j = length of the prefix shared with the previous word of the group.
			// Declared outside the loop because it is needed afterwards.
			int j;
			for(j = 0; j < prevw.GetLength(); j++)
				if(ww[j] != prevw[j]) break;
			if(j >= dict)
				dict = j + 1;
			// Count every substring (length 2 .. word length - 2) of the
			// part of the word that is not covered by the shared prefix.
			for(int l = 2; l < ww.GetLength() - 1; l++)
				for(int q = j; q + l <= ww.GetLength(); q++)
					part.GetAdd(ww.Mid(q, l), 0)++;
			prevw = ww;
			i++;
		}
	}
	printf("Creating voc\n");
	int dcount = 256 - dict;
	RLOG("dict: " << dict);
	RLOG("dict size: " << dcount);
	RLOG(" alphabet:" << alphabet.GetCount());
	RLOG(" combinations: " << dcount - alphabet.GetCount());
	// Every single character goes into the vocabulary first.
	for(i = 0; i < alphabet.GetCount(); i++)
		voc.Add(String(alphabet[i], 1));
	// Score of a candidate: occurrences * (length - 1).
	Vector<int> value;
	for(i = 0; i < part.GetCount(); i++)
		value.Add() = part[i] * (part.GetKey(i).GetLength() - 1);
	// Greedily pick the best-scoring candidate until the byte range is full.
	while(voc.GetCount() + dict < 256) {
		int m = 0;
		int mi = 0;
		for(int p = 0; p < part.GetCount(); p++)
			if(value[p] > m) {
				m = value[p];
				mi = p;
			}
		if(m <= 0) break;
		String v = part.GetKey(mi);
		vocn[voc.GetCount()] = value[mi];
		voc.Add(v);
		RLOG("Adding " << v << " value:" << value[mi] << " count:" << part[mi]);
		printf("Adding %s value %d\n", ~v, value[mi]);
		// Lower the score of candidates that overlap with the chosen one.
		for(int p = 0; p < part.GetCount(); p++) {
			if(Contains(part.GetKey(p), v))
				value[p] -= v.GetLength() * part[p];
			if(Contains(v, part.GetKey(p)))
				value[p] -= part.GetKey(p).GetLength() * part[p];
		}
		value[mi] = 0;
	}
	int sum = 0;
	for(i = 0; i < voc.GetCount(); i++) {
		sum += vocn[i];
		RLOG(vocn[i] << " " << voc[i]);
	}
	RLOG("Total " << sum);
	// ------------------
	// Pass 2: encode. Longest vocabulary entries are tried first.
	Sort(voc, LengthOrder());
	i = 0;
	while(i < w.GetCount()) {
		int linecode = LineCode(w[i]);
		String& ln = line.GetAdd(linecode);
		// '~' must be applied to the String result: a String object cannot be
		// passed through printf's variadic arguments.
		printf("LINE %s\n", ~ToLower(w[i].Mid(0, 3)));
		RLOG("---- Line " << ToLower(~w[i].Mid(0, 3)));
		String prevw;
		bool next = false;
		while(i < w.GetCount() && LineCode(w[i]) == linecode) {
			String ww = w[i];
			int j;
			for(j = 0; j < prevw.GetLength(); j++)
				if(ww[j] != prevw[j]) break;
			// The shared-prefix byte also acts as separator between words.
			if(next)
				ln.Cat(j);
			RLOG(j << "\t" << w[i]);
			next = true;
			const char *s = ~ww + j;
			while(*s) {
				// Separate index name: must not shadow the word index `i`,
				// and has to stay visible for the ASSERT below.
				int k;
				for(k = 0; k < voc.GetCount(); k++) {
					if(memcmp(s, voc[k], voc[k].GetLength()) == 0) {
						RLOG(" " << s << " " << voc[k]);
						ln.Cat(k + dict);
						s += voc[k].GetLength();
						break;
					}
				}
				// Every character of every stored word was added to the
				// vocabulary above, so a match is always expected.
				ASSERT(k < voc.GetCount());
			}
			prevw = ww;
			i++;
		}
		RLOGHEXDUMP(ln, ln.GetLength());
	}
	// Terminate every encoded group with a 0 byte.
	for(i = 0; i < line.GetCount(); i++)
		line[i].Cat(0);
	FileOut out("F:/dict/x.spell");
	out.Put(GetDefaultCharset());
	out.Put(0);
	out.Put(dict);
	for(i = 0; i < voc.GetCount(); i++) {
		out.Put(voc[i]);
		out.Put(0);
	}
	for(i = 0; i < line.GetCount(); i++) {
		out.PutL(line.GetKey(i));
		out.PutL(line[i].GetLength());
		out.Put(line[i]);
	}
}
// Entry routine of the tool: its only job is to run Make().
void Main()
{
	Make();
}
(I have not even tried to compile that yet).
The input file is specified here:
void Make()
{
FileIn in("f:/dict/cs_cz.txt");
and it should be one word per line, all possible variants. The code compresses it to .scd format.
Mirek
[Updated on: Fri, 30 October 2009 16:49] Report message to a moderator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Goto Forum:
Current Time: Mon May 06 05:30:29 CEST 2024
Total time taken to generate the page: 0.02707 seconds
|
|
|