Overview
Examples
Screenshots
Comparisons
Applications
Download
Documentation
Tutorials
Bazaar
Status & Roadmap
FAQ
Authors & License
Forums
Funding Ultimate++
Search on this site
Search in forums












SourceForge.net Logo
Home » U++ Library support » U++ Libraries and TheIDE: i18n, Unicode and Internationalization » Adding new .scd spelling dictionary
Adding new .scd spelling dictionary [message #23536] Thu, 29 October 2009 11:28 Go to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello all

I would like to add a new .scd spelling dictionary and begin to use it to detect spelling errors when writing .tpp help files.

How can I create a new .scd file and add new words to it ?

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23553 is a reply to message #23536] Fri, 30 October 2009 16:48 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
koldo wrote on Thu, 29 October 2009 06:28

Hello all

I would like to add a new .scd spelling dictionary and begin to use it to detect spelling errors when writing .tpp help files.

How can I create a new .scd file and add new words to it ?

Best regards
Koldo


Ha, good idea.

The problem is that the existing .scd files were created way back in 2002, so the code may be outdated today.

And... it took me more than an hour to find it. But I am glad we are about to refresh this one - and potentially add more .scd files.

Anyway, here we go:

#include <Speller/Speller.h>

// Declared but not referenced anywhere in the code shown here.
byte                   charset;
// vocn[k] records the score a multi-character fragment had when Make() added
// it to voc as entry k; slots for single-character entries are never written
// and keep their zero initial value.
int                    vocn[256];
// Fragment vocabulary built by Make(): first one entry per distinct input
// byte, then the selected multi-character fragments.
Vector<String>         voc;
// Encoded word data, keyed by the LineCode() of the words in each group.
VectorMap<int, String> line;

// Packs the lower-cased first three characters of `s` into one integer:
// character 0 in the low byte, character 1 shifted by 8, character 2 by 16.
// The caller must supply at least three characters (Make() pads two-letter
// words before calling this).
int LineCode(const String& s)
{
	int low  = ToLower(s[0], CHARSET_DEFAULT);
	int mid  = ToLower(s[1], CHARSET_DEFAULT) << 8;
	int high = ToLower(s[2], CHARSET_DEFAULT) << 16;
	return low + mid + high;
}

bool Contains(const String& a, const String& b)
{
	for(int i = 0; i + b.GetLength() <= a.GetLength(); i++)
		if(memcmp(~a + i, ~b, b.GetLength()) == 0) return true;
	return false;
}

// Sort predicate that places longer strings before shorter ones.
struct LengthOrder
{
	bool operator()(const String& a, const String& b) const
	{
		const int alen = a.GetLength();
		const int blen = b.GetLength();
		return blen < alen;
	}
};

struct NoCaseOrder
{
	bool operator()(const String& a, const String& b) const
	{
		String la = ToLower(a);
		String lb = ToLower(b);
		return la != lb ? la < lb : a > b;
	}
};

void Make()
{
	FileIn in("f:/dict/cs_cz.txt");
	SetDefaultCharset(CHARSET_WIN1250);
	Vector<String> w;
	Index<int> alphabet;
	String maxl;
	int    maxlen = 0;
	while(!in.IsEof()) {
		String l = in.GetLine();
		if(l.GetLength() > maxlen) {
			maxlen = l.GetLength();
			maxl = l;
		}
		if(l.GetLength() > 1) {
			if(l.GetLength() == 2)
				l.Cat(127);
			w.Add(l);
			for(const char *s = l; s < l.End(); s++)
				alphabet.FindAdd((byte)*s);
		}
	}
	
	printf("Words loaded, now sorting\n");
	
	ASSERT(maxlen < 64);
	
	LOG("Maximal length:" << maxlen << "  " << maxl);

	Sort(w, NoCaseOrder());
	
	printf("Sorted, now gathering voc candidates\n");

// ------------------
	
	VectorMap<String, int> part;
	int dict = 0;
	int i = 0;
	while(i < w.GetCount()) {
		int linecode = LineCode(w[i]);
		String prevw;
		printf("line %s\n", ~ToLower(w[i].Mid(0, 3)));
		while(i < w.GetCount() && LineCode(w[i]) == linecode) {
			String ww = w[i];
			for(int j = 0; j < prevw.GetLength(); j++)
				if(ww[j] != prevw[j]) break;
			if(j >= dict)
				dict = j + 1;
			for(int l = 2; l < ww.GetLength() - 1; l++)
				for(int q = j; q + l <= ww.GetLength(); q++)
					part.GetAdd(ww.Mid(q, l), 0)++;
			prevw = ww;
			i++;
		}
	}
	
	printf("Creating voc\n");

	int dcount = 256 - dict;
	RLOG("dict: " << dict);
	RLOG("dict size: " << dcount);
	RLOG(" alphabet:" << alphabet.GetCount());
	RLOG(" combinations: " << dcount - alphabet.GetCount());
	
	for(i = 0; i < alphabet.GetCount(); i++)
		voc.Add(String(alphabet[i], 1));

	Vector<int> value;

	for(i = 0; i < part.GetCount(); i++)
		value.Add() = part[i] * (part.GetKey(i).GetLength() - 1);

	while(voc.GetCount() + dict < 256) {
		int m = 0;
		int mi = 0;
		int i;
		for(i = 0; i < part.GetCount(); i++)
			if(value[i] > m) {
				m = value[i];
				mi = i;
			}
		if(m <= 0) break;
		String v = part.GetKey(mi);
		vocn[voc.GetCount()] = value[mi];
		voc.Add(v);
		RLOG("Adding " << v << " value:" << value[mi] << " count:" << part[mi]);
		printf("Adding %s value %d\n", ~v, value[mi]);
		for(i = 0; i < part.GetCount(); i++) {
			if(Contains(part.GetKey(i), v))
				value[i] -= v.GetLength() * part[i];
			if(Contains(v, part.GetKey(i)))
				value[i] -= part.GetKey(i).GetLength() * part[i];
		}
		value[mi] = 0;
	}
	
	int sum = 0;
	for(i = 0; i < voc.GetCount(); i++) {
		sum += vocn[i];
		RLOG(vocn[i] << "  " << voc[i]);
	}
	RLOG("Total " << sum);

// ------------------
	Sort(voc, LengthOrder());

	i = 0;
	while(i < w.GetCount()) {
		int linecode = LineCode(w[i]);
		String& ln = line.GetAdd(linecode);
		printf("LINE %s\n", ToLower(~w[i].Mid(0, 3)));
		RLOG("---- Line " << ToLower(~w[i].Mid(0, 3)));
		String prevw;
		bool next = false;
		while(i < w.GetCount() && LineCode(w[i]) == linecode) {
			String ww = w[i];
			for(int j = 0; j < prevw.GetLength(); j++)
				if(ww[j] != prevw[j]) break;
			if(next)
				ln.Cat(j);
			RLOG(j << "\t" << w[i]);
			next = true;
			const char *s = ~ww + j;
			while(*s) {
				for(int i = 0; i < voc.GetCount(); i++) {
					if(memcmp(s, voc[i], voc[i].GetLength()) == 0) {
						RLOG("  " << s << " " << voc[i]);
						ln.Cat(i + dict);
						s += voc[i].GetLength();
						break;
					}
				}
				ASSERT(i < voc.GetCount());
			}
			prevw = ww;
			i++;
		}
		RLOGHEXDUMP(ln, ln.GetLength());
	}
	int l = 0;
	for(i = 0; i < line.GetCount(); i++) {
		line[i].Cat(0);
		l += line[i].GetLength();
	}
	FileOut out("F:/dict/x.spell");
	out.Put(GetDefaultCharset());
	out.Put(0);
	out.Put(dict);
	for(i = 0; i < voc.GetCount(); i++) {
		out.Put(voc[i]);
		out.Put(0);
	}
	for(i = 0; i < line.GetCount(); i++) {
		out.PutL(line.GetKey(i));
		out.PutL(line[i].GetLength());
		out.Put(line[i]);
	}
}

// Program entry hook: the tool's only job is to run the dictionary build.
void Main()
{
	// All paths are hard-coded inside Make(), so there is nothing to pass.
	Make();
}


(I have not even tried to compile that yet).

The input file is specified here:

void Make()
{
	FileIn in("f:/dict/cs_cz.txt");


and it should be one word per line, all possible variants. The code compresses it to .scd format.

Mirek

[Updated on: Fri, 30 October 2009 16:49]

Report message to a moderator

Re: Adding new .scd spelling dictionary [message #23559 is a reply to message #23553] Sat, 31 October 2009 00:05 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

After some small changes the code works converting a text file to a .scd file.

Unfortunately I cannot "catch" that file from TheIde. It seems that the function Speller::Set from Speller.cpp does not understand the supplied .scd file.

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23564 is a reply to message #23559] Sat, 31 October 2009 11:40 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
Do you think you could send the source .txt to my email? (If it is not too big).

Mirek
Re: Adding new .scd spelling dictionary [message #23565 is a reply to message #23564] Sat, 31 October 2009 14:43 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

I have just sent you your program with small changes to make it run.

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23566 is a reply to message #23565] Sat, 31 October 2009 14:58 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

A little trick I am doing is using a renamed en-us.scd with my language, and adding all new words in an .usp file. Of course this is not ok as all English words are considered right.

As you are now handling this code, perhaps you could make a small improvement: treat upper- and lower-case variants of a word as the same, because otherwise every word has to be duplicated at least once (Hello, hello and HELLO are considered different).

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23567 is a reply to message #23566] Sat, 31 October 2009 22:52 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
koldo wrote on Sat, 31 October 2009 09:58

Hello Mirek

A little trick I am doing is using a renamed en-us.scd with my language, and adding all new words in an .usp file. Of course this is not ok as all English words are considered right.

As now your are handling this code, perhaps you could do a little improvement: consider the same upper and lower case words, because if not all words have to be at least duplicated (Hello, hello or HELLO are considered different).



Some words can only be written with capital first letter...

Mirek
Re: Adding new .scd spelling dictionary [message #23568 is a reply to message #23565] Sat, 31 October 2009 22:53 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
koldo wrote on Sat, 31 October 2009 09:43

Hello Mirek

I have just sent you tour program wit small changes to run.

Best regards
Koldo


Uh, not program, the input file (spanish words).... Smile

Mirek
Re: Adding new .scd spelling dictionary [message #23569 is a reply to message #23568] Sat, 31 October 2009 23:45 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

I have just sent you a file with just few words.
If everything is ok I would try to search for some open source of spelling dictionary.
And with all of that to use it in software, adding the possibility of proposing right words in case of spelling mistake, would not be bad ... Smile

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23577 is a reply to message #23569] Mon, 02 November 2009 10:57 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
It took me longer than I expected, but I believe it is now working. I have put the package into uppsrc.

(The problem I had to solve was that original format expected local 8-bit encoding. I have changed this to UTF-8).
Re: Adding new .scd spelling dictionary [message #23583 is a reply to message #23536] Mon, 02 November 2009 11:52 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

Thank you for your work Smile

Please tell me if I am doing it right. These are the steps I do:

- Prepare input.txt file with a word per row in the language to check spelling
- Save it in UTF8 with BOM
- Run makespellscd input.txt es-es.scd
- Copy es-es.scd to upp folder
- Recompile TheIde
- Run TheIde and open .tpp help
- Select all text and choose ES-ES language

With these steps done, the spell checker does not find mistakes, so, as the .scd file has only a few words, it does not work.
If instead of ES-ES I select EN-US, the spell checker detects the mistakes.

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23584 is a reply to message #23583] Mon, 02 November 2009 13:19 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
- UTF8 WITHOUT BOM! (BOM would be added to the first word of dict).
- Also, make sure that only letters which REQUIRE initial (or other) capital letters have them. Capitals are enforced!

Also, you have to recompile theide, because of changed format of scd to support utf-8 (it is BW compatible, so no changes are needed for old .scds).

If you can get it work, it would be nice to establish scd folder on sf.net...

Mirek
Re: Adding new .scd spelling dictionary [message #23585 is a reply to message #23584] Mon, 02 November 2009 13:31 Go to previous messageGo to next message
mirek is currently offline  mirek
Messages: 12098
Registered: November 2005
Ultimate Member
Hard to say what goes wrong in your case. I have gone through the process and it works for me (in WinXP).

Mirek
Re: Adding new .scd spelling dictionary [message #23587 is a reply to message #23585] Mon, 02 November 2009 16:55 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Sorry Mirek

Not possible to get it run for now.

I will try it this afternoon from a cleaner svn theide build.

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23593 is a reply to message #23587] Mon, 02 November 2009 23:01 Go to previous messageGo to next message
koldo is currently offline  koldo
Messages: 3030
Registered: August 2008
Ultimate Member
Hello Mirek

Excellent. It works perfectly.

Now does anybody know where to get free spelling dictionaries ?

Best regards
Koldo


Best regards
Iñaki
Re: Adding new .scd spelling dictionary [message #23594 is a reply to message #23593] Tue, 03 November 2009 02:15 Go to previous message
emr84 is currently offline  emr84
Messages: 26
Registered: April 2008
Location: Argentina
Promising Member
Maybe http://icon.shef.ac.uk/Moby/?
Previous Topic: Hack for MSC8 and "newline in constant" errors
Next Topic: how to change a line in string in zhCN translation file?
Goto Forum:
  


Current Time: Thu Nov 14 20:27:42 CET 2019

Total time taken to generate the page: 0.01771 seconds