Re: 16 bits wchar [message #12302 is a reply to message #8036]
Wed, 24 October 2007 11:58
cbpporter
Messages: 1427 Registered: September 2007
Ultimate Contributor
I've been sick and didn't leave the house, so I couldn't post. But here is my code:
// Decodes one code point from [s, z). On return, dep is the number of source
// bytes consumed and lmod is the number of bytes the returned value occupies
// when re-encoded as UTF-8. Invalid bytes are returned escaped as 0xEE00 + byte.
int utf8codepointEE(const byte *s, const byte *z, int& lmod, int& dep)
{
	if (s < z)
	{
		dword code = *s++;
		int codePoint = 0;
		if(code < 0x80)
		{	// ASCII
			dep = 1;
			lmod = 1;
			return code;
		}
		else if (code < 0xC2)
		{	// stray continuation byte or overlong lead (0xC0, 0xC1)
			dep = 1;
			lmod = 3;
			return 0xEE00 + code;
		}
		else if (code < 0xE0)
		{	// two-byte sequence
			if(s >= z)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			if (s[0] < 0x80 || s[0] >= 0xC0)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			codePoint = ((code - 0xC0) << 6) + *s - 0x80;
			if(codePoint < 0x80 || codePoint > 0x07FF)
			{	// overlong encoding
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			else
			{
				dep = 2;
				lmod = 2;
				return codePoint;
			}
		}
		else if (code < 0xF0)
		{	// three-byte sequence
			if(s + 1 >= z)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			if(s[0] < 0x80 || s[0] >= 0xC0 || s[1] < 0x80 || s[1] >= 0xC0)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			codePoint = ((code - 0xE0) << 12) + ((s[0] - 0x80) << 6) + s[1] - 0x80;
			if(codePoint < 0x0800 || codePoint > 0xFFFF)
			{	// overlong encoding
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			else
			{
				dep = 3;
				lmod = 3;
				return codePoint;
			}
		}
		else if (code < 0xF5)
		{	// four-byte sequence
			if(s + 2 >= z)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			if(s[0] < 0x80 || s[0] >= 0xC0 || s[1] < 0x80 || s[1] >= 0xC0 ||
			   s[2] < 0x80 || s[2] >= 0xC0)
			{
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			codePoint = ((code - 0xF0) << 18) + ((s[0] - 0x80) << 12) +
			            ((s[1] - 0x80) << 6) + s[2] - 0x80;
			if(codePoint < 0x010000 || codePoint > 0x10FFFF)
			{	// overlong or outside the Unicode range
				dep = 1;
				lmod = 3;
				return 0xEE00 + code;
			}
			else
			{
				dep = 4;
				lmod = 4;
				return codePoint;
			}
		}
		else
		{	// 0xF5..0xFF can never start a valid sequence
			dep = 1;
			lmod = 3;
			return 0xEE00 + code;
		}
	}
	else
		return -1;	// no input left
}
// Returns the length, in bytes, of the escaped UTF-8 form of the input.
int utf8lenEE(const char *_s, int len)
{
	const byte *s = (const byte *)_s;
	const byte *lim = s + len;
	int length = 0;
	while(s < lim) {
		int lmod, dep;
		int codePoint = utf8codepointEE(s, lim, lmod, dep);
		ASSERT(codePoint != -1);
		length += lmod;
		s += dep;
	}
	return length;
}
// Returns the length, in bytes, of the de-escaped form: every escaped code
// point (0xEE00 + byte) counts as the single original byte again.
int utf8lenDeEE(const char *_s, int len)
{
	const byte *s = (const byte *)_s;
	const byte *lim = s + len;
	int length = 0;
	while(s < lim) {
		int lmod, dep;
		int codePoint = utf8codepointEE(s, lim, lmod, dep);
		ASSERT(codePoint != -1);
		if ((codePoint & 0xFFFFFF00) == 0xEE00)
			length++;	// an escaped byte unescapes to a single byte
		else
			length += lmod;
		s += dep;
	}
	return length;
}
// Encodes one code point as UTF-8 at s and returns the advanced pointer.
inline byte *putUtf8(byte *s, int codePoint)
{
	if (codePoint < 0x80)
		*s++ = codePoint;
	else if (codePoint < 0x0800)
	{
		*s++ = 0xC0 | (codePoint >> 6);
		*s++ = 0x80 | (codePoint & 0x3F);
	}
	else if (codePoint < 0x10000)	// was "< 0xFFFF"; 0xFFFF itself needs the three-byte form
	{
		*s++ = 0xE0 | (codePoint >> 12);
		*s++ = 0x80 | ((codePoint >> 6) & 0x3F);
		*s++ = 0x80 | (codePoint & 0x3F);
	}
	else
	{
		*s++ = 0xF0 | (codePoint >> 18);
		*s++ = 0x80 | ((codePoint >> 12) & 0x3F);
		*s++ = 0x80 | ((codePoint >> 6) & 0x3F);
		*s++ = 0x80 | (codePoint & 0x3F);
	}
	return s;
}
// Converts raw input into "escaped" UTF-8: valid sequences are kept, invalid
// bytes are mapped to U+EExx, so the result is always well-formed UTF-8.
String ToUtf8EE(const char *_s, int _len)
{
	int tlen = utf8lenEE(_s, _len);
	if (tlen == -1)
		return "";
	StringBuffer result(tlen);
	const byte *s = (const byte *)_s;
	const byte *lim = s + _len;
	byte *z = (byte *)~result;
	int length = 0;
	while(s < lim) {
		int lmod, dep;
		int codePoint = utf8codepointEE(s, lim, lmod, dep);
		if (codePoint == -1)
			return "";
		length += lmod;
		s += dep;
		z = putUtf8(z, codePoint);
	}
	ASSERT(length == tlen);
	return result;
}
// Converts escaped UTF-8 back into the original byte sequence: code points in
// the U+EExx range are folded back into the single byte they stand for.
String FromUtf8EE(const char *_s, int _len)
{
	int tlen = utf8lenDeEE(_s, _len);
	if (tlen == -1)
		return "";
	StringBuffer result(tlen);
	const byte *s = (const byte *)_s;
	const byte *lim = s + _len;
	byte *z = (byte *)~result;
	int length = 0;
	while(s < lim) {
		int lmod, dep;
		int codePoint = utf8codepointEE(s, lim, lmod, dep);
		if (codePoint == -1)
			return "";
		if ((codePoint & 0xFFFFFF00) == 0xEE00)
		{
			*z++ = codePoint - 0xEE00;	// restore the original byte
			lmod = 1;
		}
		else
			z = putUtf8(z, codePoint);
		length += lmod;
		s += dep;
	}
	ASSERT(length == tlen);
	return result;
}
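For instance, a quick round-trip check of the escaping scheme could look like this (a sketch only; the byte values are arbitrary):
void RoundTripExample()
{
	const char raw[] = { 'A', '\xC3', '\xA9', '\x80' };	// 'A', U+00E9 (two bytes), stray 0x80
	String esc  = ToUtf8EE(raw, 4);						// the stray byte becomes U+EE80 (three UTF-8 bytes)
	String back = FromUtf8EE(~esc, esc.GetCount());		// the escape is folded back to the raw byte
	ASSERT(back == String(raw, 4));						// the original byte sequence is restored
}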
It is up to you to decide exactly what you want to do with Unicode. If you let me know, I can help. So please decide; if you want to leave it as it is, I will find something else to work on.
Re: 16 bits wchar [message #12303 is a reply to message #12302]
Wed, 24 October 2007 13:27
mirek
Messages: 14257 Registered: November 2005
Ultimate Member
I like the code. However, I still do not see too many practical uses.
Therefore, a new package, UnicodeEx, is perhaps the right place to put it, agreed?
As for future plans, yes, I think that going 32-bit is the ultimate solution. Anyway, before that, I would like to see many other things resolved in U++. IMO, RTL support is now the priority in this area. Maybe, if you like playing with language issues, you can invest your spare time there... I do not expect actual code, rather information.
Mirek
Re: 16 bits wchar [message #12333 is a reply to message #12308]
Thu, 25 October 2007 14:47
cbpporter
Messages: 1427 Registered: September 2007
Ultimate Contributor
I also wrote a conversion algorithm from Utf8 to Utf16, which is quite similar to my previous one.
Since I have already done these, I would like to optimize them a little. I have a couple of questions, though.
1. My code point extraction routine is a little too long and quite redundant. The same sequence that handles an incorrect value is repeated many times, and I would like to get rid of these repetitions. I could use a macro, but I don't like exposing a dangerous macro to the rest of the file, so I would have to undef it after the function. Or this could be the perfect case where the use of goto is justified and almost needed. Can I use goto?
EDIT:
P.S.: The modified version of utf8codepointEE, optimized and using goto, is 49 lines long, while the original was 115. I still consider it pretty clear, maybe even clearer, because I can fit it on one screen (a rough sketch of that shape follows below).
2. The algorithm is a little inefficient because it first calculates the length of the new buffer and then fills it. But by calculating the length, we already get enough info to populate it with the correct values, cutting the number of calculations in half. If I do this, I would need to preallocate a possibly bigger-than-necessary buffer first, fill it directly, then copy it into the new string and free the buffer. This costs one extra allocation, and I'm not sure how efficient allocations are in U++. AFAIK, you replaced the default allocator. If it has efficiency similar to the standard one, the price of the allocation is not that large, but maybe you are against this approach as it is more STL-like, allocating a lot of extra data and doing copies. If the allocator is faster, or if you have a caching mechanism, then I think it could be a lot faster this way.
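For reference, a rough sketch of the shape such a goto version could take (illustrative only, not the actual modified code):
int utf8codepointEE_goto(const byte *s, const byte *z, int& lmod, int& dep)
{
	if(s >= z)
		return -1;						// no input left
	dword code = *s++;
	if(code < 0x80) {					// ASCII
		dep = lmod = 1;
		return code;
	}
	if(code < 0xC2)						// stray continuation byte or overlong lead
		goto error;
	if(code < 0xE0) {					// two-byte sequence
		if(s >= z || s[0] < 0x80 || s[0] >= 0xC0)
			goto error;
		dep = lmod = 2;
		return ((code - 0xC0) << 6) + s[0] - 0x80;
	}
	if(code < 0xF0) {					// three-byte sequence
		if(s + 1 >= z || s[0] < 0x80 || s[0] >= 0xC0 || s[1] < 0x80 || s[1] >= 0xC0)
			goto error;
		int cp = ((code - 0xE0) << 12) + ((s[0] - 0x80) << 6) + s[1] - 0x80;
		if(cp < 0x0800)					// overlong encoding
			goto error;
		dep = lmod = 3;
		return cp;
	}
	if(code < 0xF5) {					// four-byte sequence
		if(s + 2 >= z || s[0] < 0x80 || s[0] >= 0xC0 ||
		   s[1] < 0x80 || s[1] >= 0xC0 || s[2] < 0x80 || s[2] >= 0xC0)
			goto error;
		int cp = ((code - 0xF0) << 18) + ((s[0] - 0x80) << 12) +
		         ((s[1] - 0x80) << 6) + s[2] - 0x80;
		if(cp < 0x010000 || cp > 0x10FFFF)	// overlong or outside the Unicode range
			goto error;
		dep = lmod = 4;
		return cp;
	}
error:									// every failure path ends up here
	dep = 1;							// consume the single offending byte
	lmod = 3;							// 0xEExx re-encodes as three UTF-8 bytes
	return 0xEE00 + code;
}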
[Updated on: Thu, 25 October 2007 15:57]
Re: 16 bits wchar [message #12388 is a reply to message #12333]
Sat, 27 October 2007 11:11
mirek
Messages: 14257 Registered: November 2005
Ultimate Member
cbpporter wrote on Thu, 25 October 2007 08:47 | I also wrote a conversion algorithm from Utf8 to Utf16, which is quite similar to my previous one.
Since I have already done these, I would like to optimize them a little. I have a couple of questions, though.
1. My code point extraction routine is a little too long and quite redundant. The same sequence that handles an incorrect value is repeated many times, and I would like to get rid of these repetitions. I could use a macro, but I don't like exposing a dangerous macro to the rest of the file, so I would have to undef it after the function. Or this could be the perfect case where the use of goto is justified and almost needed. Can I use goto?
|
Of course. I have no problem with using anything in the IMPLEMENTATION. I believe the main task is to keep the interfaces clear. If goto or a macro can speed up or simplify things, go for it. No need to undefine the macro either, as long as it is used in the .cpp only.
(The only thing I would ask in the implementation: if you decide to use platform/machine/CPU-specific things like "asm", be sure to provide a cross-platform "default" implementation, or at least implement it for all supported platforms.)
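For instance, the "portable default next to a platform-specific fast path" pattern can be as simple as this (PopCount32 is a hypothetical example, not an existing U++ function):
int PopCount32(dword x)
{
#if defined(__GNUC__)
	return __builtin_popcount(x);		// compiler-specific fast path
#else
	int n = 0;							// portable default, works everywhere
	while(x) {
		n += x & 1;
		x >>= 1;
	}
	return n;
#endif
}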
Quote: |
2. The algorithm is a little inefficient because it first calculates the length of the new buffer and then fills it. But by calculating the length, we already get enough info to populate it with the correct values, cutting the number of calculations in half. If I do this, I would need to preallocate a possibly bigger-than-necessary buffer first, fill it directly, then copy it into the new string and free the buffer. This costs one extra allocation, and I'm not sure how efficient allocations are in U++.
|
I believe they are quite efficient. Most of the time, about 20 CPU instructions have to be executed to allocate a memory block.
However, I am a little bit afraid that the "copy" will make it inefficient...
OTOH, IME, guessing never helps to resolve optimization issues. If you really want to play hard, benchmark it.
Also consider putting StringBuffer into the mix as well.
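For example, a single-pass variant along those lines might look like this (just a sketch reusing utf8codepointEE and putUtf8 from above, and assuming StringBuffer::SetLength trims the result to the bytes actually written):
String ToUtf8EE_OnePass(const char *_s, int _len)
{
	const byte *s = (const byte *)_s;
	const byte *lim = s + _len;
	StringBuffer result(3 * _len);		// worst case: every input byte escapes to three bytes
	byte *begin = (byte *)~result;
	byte *z = begin;
	while(s < lim) {
		int lmod, dep;
		int codePoint = utf8codepointEE(s, lim, lmod, dep);
		if(codePoint == -1)
			break;
		s += dep;
		z = putUtf8(z, codePoint);
	}
	result.SetLength(int(z - begin));	// shrink to the bytes actually written
	return result;
}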
Quote: |
If the allocator is faster, or if you have a caching mechanism, then I think it could be a lot faster this way.
|
Well, it is fast enough to make the STL idea of speed-optimized allocators in container templates obsolete.
Mirek
Re: 16 bits wchar [message #12504 is a reply to message #12388]
Tue, 06 November 2007 13:31
cbpporter
Messages: 1427 Registered: September 2007
Ultimate Contributor
I've been a little busy these past days, so I didn't have time to benchmark stuff or do more optimizations. I attached my two modified files, because I'm tired of copying and pasting so much code and making the thread hard to read.
I also looked over the RTL issue. I used the resources from unicode.org, mainly the "bidirectional algorithm". It is not that hard, but you have to split the text into paragraphs, then lines, then compute the direction based on control characters, create a dummy string and display it. If you add cursor movement, I think the issue is not that simple.
The question is how far you want to go with RTL. The simplest solution is to just add a right-click option to editable texts, or to check the first character of a string to see whether it is an RTL mark (a minimal sketch of that check follows below). Or you could implement the full algorithm. Also, these control characters must be excluded from searches and other string comparison operations.
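A minimal sketch of that first-character check, assuming U++'s 16-bit wchar (StartsRtl is just an illustration, nowhere near the full bidi algorithm):
bool StartsRtl(const wchar *s, int len)
{
	if(len <= 0)
		return false;
	int c = s[0];
	return c == 0x200F || c == 0x202B ||		// RLM, RLE marks
	       (c >= 0x0590 && c <= 0x05FF) ||		// Hebrew block
	       (c >= 0x0600 && c <= 0x06FF);		// Arabic block
}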
Attachment: CharSet.zip (Size: 19.67KB, Downloaded 490 times)
Re: 16 bits wchar [message #12564 is a reply to message #12504]
Fri, 09 November 2007 10:39
mirek
Messages: 14257 Registered: November 2005
Ultimate Member
cbpporter wrote on Tue, 06 November 2007 07:31 | I've been a little busy these past days, so I didn't have time to benchmark stuff or do more optimizations. I attached my two modified files, because I'm tired of copying and pasting so much code and making the thread hard to read.
I also looked over the RTL issue. I used the resources from unicode.org, mainly the "bidirectional algorithm". It is not that hard, but you have to split the text into paragraphs, then lines, then compute the direction based on control characters, create a dummy string and display it. If you add cursor movement, I think the issue is not that simple.
The question is how far you want to go with RTL. The simplest solution is to just add a right-click option to editable texts, or to check the first character of a string to see whether it is an RTL mark. Or you could implement the full algorithm. Also, these control characters must be excluded from searches and other string comparison operations.
|
Hm, haven't we agreed to produce the UnicodeEx package?
Mirek
Re: 16 bits wchar [message #17208 is a reply to message #16970]
Sat, 02 August 2008 13:27
cbpporter
Messages: 1427 Registered: September 2007
Ultimate Contributor
I have finally made some progress on this!
But not under Linux. I just couldn't get characters outside the BMP to print, because each such character was interpreted as two characters. Anyway, it is surely possible, since most applications do manage to print them, but since I have never coded for X before, I'm probably doing something wrong.
There is also a funny little story about me installing everything my distro had regarding fonts, in the hope of increasing the number of displayable characters. It turns out that all of it was almost 1 GB of fonts and related stuff, and now I do have some extra fonts visible, but at the price of every drawing operation slowing to a crawl. So we have a classic less-is-more situation here.
But under Windows I'm having better luck and am now displaying almost the full range of the JIS standard characters! Surrogate pairs are enabled by default, but I needed to install some free fonts. It is strange that this is still not enough, and I had to add some fallback fonts to the registry to get the display working. I guess Windows does not search every possible font for the characters, and somehow filters them, excluding the fonts that are needed. U++ does not do any extra searching in different fonts under Windows (or Linux), so maybe we need to somehow take these registry settings into account.
From U++'s point of view, in order to get everything working I still need to get GetTextSize/FontInfo::GetCM working with surrogate pairs.
Do you know of other key functions or classes that I need to look over to get basic output working? And could you explain in a few words how font composition works in U++? I found the code, but font composition is not used when I try to draw text. It will probably also need to be modified to work with surrogates.
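For reference, the UTF-16 surrogate-pair split that such routines have to handle is plain arithmetic (ToSurrogatePair is just an illustrative helper, not an existing U++ function; word is U++'s 16-bit unsigned type):
void ToSurrogatePair(int codePoint, word& hi, word& lo)
{
	ASSERT(codePoint >= 0x10000 && codePoint <= 0x10FFFF);
	codePoint -= 0x10000;
	hi = word(0xD800 + (codePoint >> 10));		// high (lead) surrogate
	lo = word(0xDC00 + (codePoint & 0x3FF));	// low (trail) surrogate
}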
Re: 16 bits wchar [message #17214 is a reply to message #17208]
Sat, 02 August 2008 18:34
cbpporter
Messages: 1427 Registered: September 2007
Ultimate Contributor
Great! I've gotten GetTextSize to work! I also investigated, and the fallback registry settings are not necessary for text drawing and size computation to work. For plane 0 characters, if the font is available, the character will be drawn (except in one case I will get back to later), and if not, little black rectangles will be drawn at the correct position and size. The fallback font doesn't seem to be used at all. Maybe if I uninstall the standard CJK fonts, Windows will start using the fallback.
For plane 2, the situation is the same, except that the fallback setting must be present for the characters to be drawn. Without it, even when the font is present, placeholder rectangles are drawn.
There is only one last problem. For some characters in plane 0, I can't get the character to show. All other Unicode-enabled apps on my system render them correctly, even without the fallback setting, but in U++ these characters appear as little black boxes. A workaround is to directly specify a font name which supports the given characters. Using this workaround, I can get full JIS support with two extra free fonts and one registry setting. But specifying the font manually is not a long-term solution, and I need to find out why U++ has problems with some characters in the following ranges: 0x3402-0x4d77 and 0xfa30-0xfa6a.