|
I want to read an XML file in Unicode, UTF-8, or a native encoding
into a wchar_t string, so I wrote the routine below. However,
sometimes a Unicode file that includes Chinese characters cannot be read
completely, and I cannot tell where the root cause lies, so I need your
help. Please give me a hand.
Thanks.
static Status LoadXMLFile2String(const char *filename, wchar_t *text){
FILE *f;
if(!(f = fopen(filename, "r"))){
__printDebugA("Input file %s cannot be opened.", filename);
return ERROR;
}
char *encoding;
//transform routine: other --unicode --other
const unsigned char UTF_8_HEAD[3] = {239, 187, 191};
const unsigned char UNICODE_HEAD[2] = {255, 254};
const unsigned char UNICODE_BIGENDIAN_HEAD[2] = {254, 255};
unsigned char head[3];
fread(head, 1, 3, f);
if(!memcmp(head, UNICODE_HEAD, 2)){
encoding = "UNICODE";
}
else if(!memcmp(head, UNICODE_BIGENDIAN_HEAD, 2)){
encoding = "UNICODE_BIGENDIAN";
}
else if(!memcmp(head, UTF_8_HEAD, 3)){
encoding = "UTF_8";
}
else{
encoding = "ANSI";
}
char *str = (char *) malloc((MAXXMLFILESIZE + 1) * sizeof(char));
int i = 0;
if(!strcmp(encoding, "ANSI")){
str[0] = head[0];
str[1] = head[1];
str[2] = head[2];
i = 3;
}
else if(!strcmp(encoding, "UNICODE") || !strcmp(encoding,
"UNICODE_BIGENDIAN")){
str[0] = head[2];
i = 1;
}
while(!feof(f)){
if(i >= MAXXMLFILESIZE){
db_error(L"The file is too large.");
return ERROR;
}
str[i] = fgetc(f);
i++;
}
str[i] = '\0';
if(!strcmp(encoding, "UNICODE")){
for(int j = 0; j < i - 1; j++){
if(j % 2){
text[j/2] += ((unsigned char) str[j]) << 8;
}
else{
text[j/2] = (unsigned char) str[j];
}
}
text[j/2] = 0;
//db_debug(L"%d", wcslen(text));
}
else if(!strcmp(encoding, "UNICODE_BIGENDIAN")){
for(int j = 0; j < i; j++){
if(j % 2){
text[j/2] = (text[j/2] << 8) + (unsigned char) str[j];
}
else{
text[j/2] = (unsigned char) str[j];
}
}
text[j/2] = 0;
}
else if(!strcmp(encoding, "UTF_8")){
UTF2Unicode(str, text);
}
else if(!strcmp(encoding, "ANSI")){
setlocale(LC_CTYPE, "");
mbstowcs(text, str, MAXXMLFILESIZE + 1);
}
else{
assert(FALSE);
}
free(str);
fclose(f);
return OK;
} | |
Share:
|
help
"st******@gmail.com 写道：
"
I want to read an XML file in Unicode, UTF-8, or a native encoding
into a wchar_t string, so I wrote the routine below. However,
sometimes a Unicode file that includes Chinese characters cannot be read
completely, and I cannot tell where the root cause lies, so I need your
help. Please give me a hand.
Thanks.
static Status LoadXMLFile2String(const char *filename, wchar_t *text){
FILE *f;
if(!(f = fopen(filename, "r"))){
__printDebugA("Input file %s cannot be opened.", filename);
return ERROR;
}
char *encoding;
//transform routine: other --unicode --other
const unsigned char UTF_8_HEAD[3] = {239, 187, 191};
const unsigned char UNICODE_HEAD[2] = {255, 254};
const unsigned char UNICODE_BIGENDIAN_HEAD[2] = {254, 255};
unsigned char head[3];
fread(head, 1, 3, f);
if(!memcmp(head, UNICODE_HEAD, 2)){
encoding = "UNICODE";
}
else if(!memcmp(head, UNICODE_BIGENDIAN_HEAD, 2)){
encoding = "UNICODE_BIGENDIAN";
}
else if(!memcmp(head, UTF_8_HEAD, 3)){
encoding = "UTF_8";
}
else{
encoding = "ANSI";
}
char *str = (char *) malloc((MAXXMLFILESIZE + 1) * sizeof(char));
int i = 0;
if(!strcmp(encoding, "ANSI")){
str[0] = head[0];
str[1] = head[1];
str[2] = head[2];
i = 3;
}
else if(!strcmp(encoding, "UNICODE") || !strcmp(encoding,
"UNICODE_BIGENDIAN")){
str[0] = head[2];
i = 1;
}
while(!feof(f)){
if(i >= MAXXMLFILESIZE){
db_error(L"The file is too large.");
return ERROR;
}
str[i] = fgetc(f);
i++;
}
str[i] = '\0';
if(!strcmp(encoding, "UNICODE")){
for(int j = 0; j < i - 1; j++){
if(j % 2){
text[j/2] += ((unsigned char) str[j]) << 8;
}
else{
text[j/2] = (unsigned char) str[j];
}
}
text[j/2] = 0;
//db_debug(L"%d", wcslen(text));
}
else if(!strcmp(encoding, "UNICODE_BIGENDIAN")){
for(int j = 0; j < i; j++){
if(j % 2){
text[j/2] = (text[j/2] << 8) + (unsigned char) str[j];
}
else{
text[j/2] = (unsigned char) str[j];
}
}
text[j/2] = 0;
}
else if(!strcmp(encoding, "UTF_8")){
UTF2Unicode(str, text);
}
else if(!strcmp(encoding, "ANSI")){
setlocale(LC_CTYPE, "");
mbstowcs(text, str, MAXXMLFILESIZE + 1);
}
else{
assert(FALSE);
}
free(str);
fclose(f);
return OK;
}
| | | st******@gmail.com wrote:
I want to read an XML file in Unicode, UTF-8, or a native encoding
into a wchar_t string, so I wrote the routine below. However,
sometimes a Unicode file that includes Chinese characters cannot be read
completely, and I cannot tell where the root cause lies, so I need your
help. Please give me a hand.
Thanks.
[code snipped]
This code is horrible on so many levels. Mostly I suspect because it is
in C rather than C++.
You will have something much easier to work with if you reformulate
this in C++ and apply some more useful abstractions to it.
As for your error, you are only checking a few encodings and assuming
that there is a BOM to tell you which to use. You need to check the XML
prolog. It may be that the Chinese file is using a different encoding.
K | | This discussion thread is closed Replies have been disabled for this discussion. Similar topics
4 posts
views
Thread by Achim Domma |
last post: by
|
17 posts
views
Thread by Guyon Morée |
last post: by
|
5 posts
views
Thread by wolfgang haefelinger |
last post: by
|
3 posts
views
Thread by hunterb |
last post: by
|
2 posts
views
Thread by hezhenjie@gmail.com |
last post: by
|
1 post
views
Thread by Jordan |
last post: by
|
10 posts
views
Thread by Tibby |
last post: by
|
1 post
views
Thread by markww |
last post: by
|
14 posts
views
Thread by Zoro |
last post: by
| | | | | | | | | | |