1
0
mirror of https://github.com/VDR4Arch/vdr.git synced 2023-10-10 13:36:52 +02:00

Implemented character set conversion in 'libsi'

This commit is contained in:
Klaus Schmidinger 2007-04-22 14:49:26 +02:00
parent 6d88da9385
commit 2ac9030e65
4 changed files with 217 additions and 30 deletions

21
HISTORY
View File

@ -5139,10 +5139,29 @@ Video Disk Recorder Revision History
parameter to 0 turns off the automatic channel switching, and the user will
have to confirm the entry by pressing the "Ok" key.
2007-03-10: Version 1.5.2
2007-04-22: Version 1.5.2
- Updated the Finnish OSD texts (thanks to Rolf Ahrenberg).
- Fixed handling user activity for shutdown, which I had messed up when adopting Udo's
original patch (thanks to Udo Richter).
- Added Turkish language texts (thanks to Oktay Yolgeçen).
- Added missing rules for generating iso8859-13 font to Makefile.
- 'libsi' now converts the incoming strings into the system's character set
according to the DVB standard. The system's character set is determined from
the LANG environment variable. If no recognizable setting can be found, no
conversion will take place. Note that currently only the strings received from the
SI data stream are converted; there have not been any changes regarding displaying
UTF-8 characters on the OSD yet - this will follow in one of the next steps.
With this conversion, it should now be safe to run VDR on a UTF-8 file system,
because all incoming characters are converted to UTF-8. This will most likely
result in wrong characters being displayed on the OSD (because there UTF-8 is
not known, yet), but the file names should be ok (haven't tested this myself,
though, because I don't do UTF-8 - so please be very careful when testing!).
There's one piece of bad news here: the German pay-tv broadcaster Premiere
apparently encodes all EPG strings as ISO8859-1, but fails to correctly mark
these strings as such. Therefore 'libsi' (following the DVB standard) considers
the strings to be encoded in the default ISO6937 and converts them to whatever
the system's character set is. This, of course, results in wrong umlauts.
On its old transponder, the ProSieben/SAT.1 channels also had their EPG data
wrongly encoded, but apparently on the new transponder, on which they started
broadcasting this month, they got it right.

View File

@ -6,12 +6,15 @@
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* $Id: si.c 1.17 2007/02/03 11:45:58 kls Exp $
* $Id: si.c 1.18 2007/04/22 13:56:39 kls Exp $
* *
***************************************************************************/
#include <string.h>
#include "si.h"
#include <errno.h>
#include <iconv.h>
#include <malloc.h>
#include <string.h>
#include "descriptor.h"
namespace SI {
@ -232,7 +235,6 @@ char *String::getText(char *buffer, int size) {
return buffer;
}
//taken from VDR, Copyright Klaus Schmidinger <kls@cadsoft.de>
char *String::getText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
int len=getLength();
if (len < 0 || len >= sizeBuffer) {
@ -245,21 +247,163 @@ char *String::getText(char *buffer, char *shortVersion, int sizeBuffer, int size
return buffer;
}
//taken from libdtv, Copyright Rolf Hakenes <hakenes@hippomi.de>
// Character table names, indexed by the value of the first byte of a string
// (see getCharacterTable()). A NULL entry means that no table name is assigned
// to that value here. The value 0x10 is not looked up in this table; it is
// handled separately by getCharacterTable(), using CharacterTables2.
static const char *CharacterTables1[] = {
NULL, // 0x00
"ISO8859-5", // 0x01
"ISO8859-6", // 0x02
"ISO8859-7", // 0x03
"ISO8859-8", // 0x04
"ISO8859-9", // 0x05
"ISO8859-10", // 0x06
"ISO8859-11", // 0x07
"ISO8859-12", // 0x08
"ISO8859-13", // 0x09
"ISO8859-14", // 0x0A
"ISO8859-15", // 0x0B
NULL, // 0x0C
NULL, // 0x0D
NULL, // 0x0E
NULL, // 0x0F
NULL, // 0x10
"UTF16", // 0x11
"EUC-KR", // 0x12
"GB2312", // 0x13
"GBK", // 0x14
"UTF8", // 0x15
NULL, // 0x16
NULL, // 0x17
NULL, // 0x18
NULL, // 0x19
NULL, // 0x1A
NULL, // 0x1B
NULL, // 0x1C
NULL, // 0x1D
NULL, // 0x1E
NULL, // 0x1F
};
// Entries of CharacterTables1 with an index up to and including this value
// are treated as single byte character tables; entries with a higher index
// are treated as multi byte.
#define SingleByteLimit 0x0B
// Character table names used when the first byte of a string is 0x10. The
// index is the 16 bit value formed from the second (high) and third (low)
// byte of the string (see getCharacterTable()). A NULL entry means that no
// table name is assigned to that value here. All entries are treated as
// single byte character tables.
static const char *CharacterTables2[] = {
NULL, // 0x00
"ISO8859-1", // 0x01
"ISO8859-2", // 0x02
"ISO8859-3", // 0x03
"ISO8859-4", // 0x04
"ISO8859-5", // 0x05
"ISO8859-6", // 0x06
"ISO8859-7", // 0x07
"ISO8859-8", // 0x08
"ISO8859-9", // 0x09
"ISO8859-10", // 0x0A
"ISO8859-11", // 0x0B
NULL, // 0x0C
"ISO8859-13", // 0x0D
"ISO8859-14", // 0x0E
"ISO8859-15", // 0x0F
};
// Number of elements in an array of 'char *'. Must only be applied to a real
// array (not to a pointer), since it relies on sizeof() of the whole array.
#define NumEntries(Table) (sizeof(Table) / sizeof(char *))
// Name of the character table all strings are converted to. Points to an entry
// of one of the tables above, or is NULL, in which case no conversion is done.
static const char *SystemCharacterTable = NULL;
// Tells whether SystemCharacterTable is a single byte character table.
// Maintained by SetSystemCharacterTable(); true while no table is set.
bool SystemCharacterTableIsSingleByte = true;
// Returns true if the two character table names are the same, ignoring
// ASCII upper/lower case and any '-' or '_' separators. This makes the
// common spellings "UTF-8", "utf8", "ISO-8859-15" and "iso8859-15" all match
// the names stored in the character tables. The comparison deliberately does
// not depend on the current locale.
static bool sameCharacterTableName(const char *a, const char *b)
{
  for (;;) {
      while (*a == '-' || *a == '_')
            a++;
      while (*b == '-' || *b == '_')
            b++;
      char ca = *a;
      char cb = *b;
      if (ca >= 'A' && ca <= 'Z')
         ca += 'a' - 'A';
      if (cb >= 'A' && cb <= 'Z')
         cb += 'a' - 'A';
      if (ca != cb)
         return false;
      if (!ca)
         return true; // both strings ended at the same time
      a++;
      b++;
      }
}

// Sets the character table all strings shall be converted to.
// CharacterTable is a name like "iso8859-15" or "utf-8" (upper/lower case
// and '-' or '_' separators are ignored when looking it up). NULL turns
// the conversion off.
// Returns true if the name was recognized (or was NULL). If the name is not
// recognized, the current setting is left untouched and false is returned.
bool SetSystemCharacterTable(const char *CharacterTable) {
  if (!CharacterTable) {
     SystemCharacterTable = NULL;
     SystemCharacterTableIsSingleByte = true;
     return true;
     }
  for (unsigned int i = 0; i < NumEntries(CharacterTables1); i++) {
      if (CharacterTables1[i] && sameCharacterTableName(CharacterTable, CharacterTables1[i])) {
         SystemCharacterTable = CharacterTables1[i];
         // Only the entries up to SingleByteLimit are single byte tables:
         SystemCharacterTableIsSingleByte = i <= SingleByteLimit;
         return true;
         }
      }
  for (unsigned int i = 0; i < NumEntries(CharacterTables2); i++) {
      if (CharacterTables2[i] && sameCharacterTableName(CharacterTable, CharacterTables2[i])) {
         SystemCharacterTable = CharacterTables2[i];
         SystemCharacterTableIsSingleByte = true;
         return true;
         }
      }
  return false;
}
// Inspects the leading byte(s) of 'buffer' to find out which character table
// the text is encoded in, and returns the name of that table.
// If a table is recognized, 'buffer' and 'length' are advanced past the
// selector byte(s) (one byte, or three bytes if the first byte is 0x10).
// Otherwise the default "ISO6937" is returned and 'buffer' and 'length' are
// left unchanged.
// If 'isSingleByte' is given, it is set to true if the recognized table is a
// single byte table, and to false otherwise (including the default case).
static const char *getCharacterTable(const unsigned char *&buffer, int &length, bool *isSingleByte = NULL) {
  const char *fallback = "ISO6937";
  if (isSingleByte)
     *isSingleByte = false;
  if (length <= 0)
     return fallback;
  const unsigned int selector = buffer[0];
  if (selector >= 0x20)
     return fallback; // plain text, no selector byte present
  if (selector != 0x10) {
     // One byte selector, looked up directly:
     if (selector >= NumEntries(CharacterTables1) || !CharacterTables1[selector])
        return fallback;
     buffer++;
     length--;
     if (isSingleByte)
        *isSingleByte = selector <= SingleByteLimit;
     return CharacterTables1[selector];
     }
  // Selector 0x10 is followed by a 16 bit table number:
  if (length < 3)
     return fallback;
  const unsigned int extended = (buffer[1] << 8) | buffer[2];
  if (extended >= NumEntries(CharacterTables2) || !CharacterTables2[extended])
     return fallback;
  buffer += 3;
  length -= 3;
  if (isSingleByte)
     *isSingleByte = true;
  return CharacterTables2[extended];
}
// Converts the 'fromLength' bytes at 'from', which are encoded in the
// character table named 'fromCode', into the system character table and
// stores the result as a 0-terminated string in 'to'. 'toLength' is the
// total size of the 'to' buffer, including the terminating 0; output that
// doesn't fit is silently cut off.
// Characters that can't be converted are replaced with '?'.
// Returns true if a conversion took place. Returns false (leaving 'to'
// untouched) if no system character table is set, if 'toLength' is 0, or if
// the conversion between the two character tables is not available.
static bool convertCharacterTable(const char *from, size_t fromLength, char *to, size_t toLength, const char *fromCode)
{
  if (!SystemCharacterTable || toLength == 0)
     return false;
  iconv_t cd = iconv_open(SystemCharacterTable, fromCode);
  // iconv_open() reports failure as (iconv_t)-1, which is not a value that
  // can be detected by comparing against 0:
  if (cd == (iconv_t)-1)
     return false;
  char *fromPtr = (char *)from; // iconv() takes a non-const input pointer
  // Keep one byte in reserve, so that the terminating 0 always fits:
  size_t toAvailable = toLength - 1;
  while (fromLength > 0 && toAvailable > 0) {
        if (iconv(cd, &fromPtr, &fromLength, &to, &toAvailable) == size_t(-1)) {
           // Anything but an unconvertible character ends the conversion. The
           // same applies if there is no room left for the '?' marker:
           if (errno != EILSEQ || toAvailable == 0)
              break;
           // A character can't be converted, so mark it with '?' and proceed:
           fromPtr++;
           fromLength--;
           *to++ = '?';
           toAvailable--;
           }
        }
  *to = 0;
  iconv_close(cd);
  return true;
}
// originally from libdtv, Copyright Rolf Hakenes <hakenes@hippomi.de>
void String::decodeText(char *buffer, int size) {
const unsigned char *from=data.getData(0);
char *to=buffer;
/* Disable detection of coding tables - libdtv doesn't do it either
if ( (0x01 <= *from) && (*from <= 0x1f) ) {
codeTable=*from
}
*/
if (*from == 0x10)
from += 3; // skips code table info
int len=getLength();
if (len <= 0) {
*to = '\0';
return;
}
bool singleByte;
const char *cs = getCharacterTable(from, len, &singleByte);
for (int i = 0; i < len; i++) {
if (*from == 0)
break;
@ -276,6 +420,11 @@ void String::decodeText(char *buffer, int size) {
break;
}
*to = '\0';
if (!singleByte || !SystemCharacterTableIsSingleByte) {
char convBuffer[size];
if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
}
}
void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
@ -283,11 +432,14 @@ void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int si
char *to=buffer;
char *toShort=shortVersion;
int IsShortName=0;
if (*from == 0x10)
from += 3; // skips code table info
int len=getLength();
if (len <= 0) {
*to = '\0';
*toShort = '\0';
return;
}
bool singleByte;
const char *cs = getCharacterTable(from, len, &singleByte);
for (int i = 0; i < len; i++) {
if ( ((' ' <= *from) && (*from <= '~'))
|| (*from == '\n')
@ -312,6 +464,14 @@ void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int si
}
*to = '\0';
*toShort = '\0';
if (!singleByte || !SystemCharacterTableIsSingleByte) {
char convBuffer[sizeBuffer];
if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
char convShortVersion[sizeShortVersion];
if (convertCharacterTable(shortVersion, strlen(shortVersion), convShortVersion, sizeof(convShortVersion), cs))
strncpy(shortVersion, convShortVersion, strlen(convShortVersion) + 1);
}
}
Descriptor *Descriptor::getDescriptor(CharArray da, DescriptorTagDomain domain, bool returnUnimplemetedDescriptor) {

View File

@ -6,7 +6,7 @@
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* $Id: si.h 1.16 2007/02/03 11:47:25 kls Exp $
* $Id: si.h 1.17 2007/04/22 13:32:09 kls Exp $
* *
***************************************************************************/
@ -486,6 +486,11 @@ protected:
void decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion);
};
// Call this function to set the system character table. CharacterTable is a string
// like "iso8859-15" or "utf-8" (case insensitive).
// Returns true if the character table was recognized.
bool SetSystemCharacterTable(const char *CharacterTable);
} //end of namespace
#endif //LIBSI_SI_H

23
vdr.c
View File

@ -22,7 +22,7 @@
*
* The project's page is at http://www.cadsoft.de/vdr
*
* $Id: vdr.c 1.286 2007/03/10 13:00:22 kls Exp $
* $Id: vdr.c 1.287 2007/04/22 13:28:32 kls Exp $
*/
#include <getopt.h>
@ -455,15 +455,6 @@ int main(int argc, char *argv[])
return 0;
}
// Check for UTF-8 and exit if present - asprintf() will fail if it encounters 8 bit ASCII codes
char *LangEnv;
if ((LangEnv = getenv("LANG")) != NULL && strcasestr(LangEnv, "utf") ||
(LangEnv = getenv("LC_ALL")) != NULL && strcasestr(LangEnv, "utf") ||
(LangEnv = getenv("LC_CTYPE")) != NULL && strcasestr(LangEnv, "utf")) {
fprintf(stderr, "vdr: please turn off UTF-8 before starting VDR\n");
return 2;
}
// Log file:
if (SysLogLevel > 0)
@ -500,6 +491,18 @@ int main(int argc, char *argv[])
dsyslog("running as daemon (tid=%d)", cThread::ThreadId());
cThread::SetMainThreadId();
// Set the system character table:
char *LangEnv = getenv("LANG");
if (LangEnv) {
char *CodeSet = strchr(LangEnv, '.');
if (CodeSet) {
CodeSet++; // skip the dot
bool known = SI::SetSystemCharacterTable(CodeSet);
isyslog("codeset is '%s' - %s", CodeSet, known ? "known" : "unknown");
}
}
// Main program loop variables - need to be here to have them initialized before any EXIT():
cOsdObject *Menu = NULL;