Implemented character set conversion in 'libsi'

2025-03-01 10:50:46 +00:00 · 2007-04-22 14:49:26 +02:00 · 2007-04-22 14:49:26 +02:00 · 2ac9030e65
commit 2ac9030e65
parent 6d88da9385
4 changed files with 217 additions and 30 deletions
--- a/21
+++ b/21
@ -5139,10 +5139,29 @@ Video Disk Recorder Revision History
  parameter to 0 turns off the automatic channel switching, and the user will
  have to confirm the entry by pressing the "Ok" key.

-2007-03-10: Version 1.5.2
+2007-04-22: Version 1.5.2

 - Updated the Finnish OSD texts (thanks to Rolf Ahrenberg).
 - Fixed handling user activity for shutdown, which I had messed up when adopting Udo's
  original patch (thanks to Udo Richter).
 - Added Turkish language texts (thanks to Oktay Yolgeçen).
 - Added missing rules for generating iso8859-13 font to Makefile.
+- 'libsi' now converts the incoming strings into the system's character set
+  according to the DVB standard. The system's character set is determined from
+  the LANG environment variable. If no recognizable setting can be found, no
+  conversion will take place. Note that currently only the strings received from the
+  SI data stream are converted; there have not yet been any changes regarding the
+  display of UTF-8 characters on the OSD - this will follow in one of the next steps.
+  With this conversion, it should now be safe to run VDR on a UTF-8 file system,
+  because all incoming characters are converted to UTF-8. This will most likely
+  result in wrong characters being displayed on the OSD (because there UTF-8 is
+  not known, yet), but the file names should be ok (haven't tested this myself,
+  though, because I don't do UTF-8 - so please be very careful when testing!).
+  There's one piece of bad news here: the German pay-tv broadcaster Premiere
+  apparently encodes all EPG strings as ISO8859-1, but fails to correctly mark
+  these strings as such. Therefore 'libsi' (following the DVB standard) considers
+  the strings to be encoded in the default ISO6937 and converts them to whatever
+  the system's character set is. This, of course, results in wrong umlauts.
+  On its old transponder, the ProSieben/SAT.1 channels also had their EPG data
+  wrongly encoded, but apparently on the new transponder they started broadcasting
+  on this month, they got it right.
--- a/libsi/si.c
+++ b/libsi/si.c
@ -6,12 +6,15 @@
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
- *   $Id: si.c 1.17 2007/02/03 11:45:58 kls Exp $
+ *   $Id: si.c 1.18 2007/04/22 13:56:39 kls Exp $
 *                                                                         *
 ***************************************************************************/

-#include <string.h>
 #include "si.h"
+#include <errno.h>
+#include <iconv.h>
+#include <malloc.h>
+#include <string.h>
 #include "descriptor.h"

 namespace SI {
@ -232,7 +235,6 @@ char *String::getText(char *buffer, int size) {
   return buffer;
 }

-//taken from VDR, Copyright Klaus Schmidinger <kls@cadsoft.de>
 char *String::getText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
   int len=getLength();
   if (len < 0 || len >= sizeBuffer) {
@ -245,21 +247,163 @@ char *String::getText(char *buffer, char *shortVersion, int sizeBuffer, int size
   return buffer;
 }

-//taken from libdtv, Copyright Rolf Hakenes <hakenes@hippomi.de>
+static const char *CharacterTables1[] = { // table names indexed by the first byte of a string (see getCharacterTable()); NULL marks a value without a usable table
+  NULL,         // 0x00
+  "ISO8859-5",  // 0x01
+  "ISO8859-6",  // 0x02
+  "ISO8859-7",  // 0x03
+  "ISO8859-8",  // 0x04
+  "ISO8859-9",  // 0x05
+  "ISO8859-10", // 0x06
+  "ISO8859-11", // 0x07
+  "ISO8859-12", // 0x08 NOTE(review): ISO 8859-12 was apparently never published, so iconv may not know this name - confirm
+  "ISO8859-13", // 0x09
+  "ISO8859-14", // 0x0A
+  "ISO8859-15", // 0x0B last entry that is treated as a single byte table (see SingleByteLimit)
+  NULL,         // 0x0C
+  NULL,         // 0x0D
+  NULL,         // 0x0E
+  NULL,         // 0x0F
+  NULL,         // 0x10 handled separately: a leading 0x10 selects an entry of CharacterTables2
+  "UTF16",      // 0x11 this and the following entries are treated as multi-byte (index > SingleByteLimit)
+  "EUC-KR",     // 0x12
+  "GB2312",     // 0x13
+  "GBK",        // 0x14
+  "UTF8",       // 0x15
+  NULL,         // 0x16
+  NULL,         // 0x17
+  NULL,         // 0x18
+  NULL,         // 0x19
+  NULL,         // 0x1A
+  NULL,         // 0x1B
+  NULL,         // 0x1C
+  NULL,         // 0x1D
+  NULL,         // 0x1E
+  NULL,         // 0x1F
+};
+
+#define SingleByteLimit 0x0B // highest CharacterTables1 index that is treated as a single byte table
+
+static const char *CharacterTables2[] = { // table names indexed by the 16 bit value following a leading 0x10 byte (see getCharacterTable()); all of them are treated as single byte tables
+  NULL,         // 0x00
+  "ISO8859-1",  // 0x01
+  "ISO8859-2",  // 0x02
+  "ISO8859-3",  // 0x03
+  "ISO8859-4",  // 0x04
+  "ISO8859-5",  // 0x05
+  "ISO8859-6",  // 0x06
+  "ISO8859-7",  // 0x07
+  "ISO8859-8",  // 0x08
+  "ISO8859-9",  // 0x09
+  "ISO8859-10", // 0x0A
+  "ISO8859-11", // 0x0B
+  NULL,         // 0x0C no table available for this value
+  "ISO8859-13", // 0x0D
+  "ISO8859-14", // 0x0E
+  "ISO8859-15", // 0x0F
+};
+
+#define NumEntries(Table) (sizeof(Table) / sizeof(char *)) // number of elements in an array of char pointers (must be given a real array, not a pointer)
+
+static const char *SystemCharacterTable = NULL; // name of the table to convert to, as set by SetSystemCharacterTable(); NULL means that no conversion is done
+bool SystemCharacterTableIsSingleByte = true; // false once SetSystemCharacterTable() has selected a CharacterTables1 entry above SingleByteLimit
+
+// Compares two character table names, ignoring the case of ASCII letters as
+// well as any '-' and '_' separators, so that e.g. "utf-8", "UTF8" and "Utf_8"
+// are all considered equal. Returns true if the names match.
+static bool sameCharacterTableName(const char *a, const char *b) {
+   for (;;) {
+      while (*a == '-' || *a == '_')
+         a++;
+      while (*b == '-' || *b == '_')
+         b++;
+      char ca = *a;
+      char cb = *b;
+      if (ca >= 'A' && ca <= 'Z')
+         ca += 'a' - 'A';
+      if (cb >= 'A' && cb <= 'Z')
+         cb += 'a' - 'A';
+      if (ca != cb)
+         return false;
+      if (!ca)
+         return true; // both names ended at the same time
+      a++;
+      b++;
+   }
+}
+
+// Sets the character table into which incoming strings are converted.
+// CharacterTable is looked up in CharacterTables1 and CharacterTables2; the
+// comparison ignores case and '-'/'_' separators, because the codeset part of
+// LANG is commonly spelled "UTF-8" or "ISO-8859-15", while the tables store
+// "UTF8" and "ISO8859-15".
+// Passing NULL turns the conversion off.
+// Returns true if the table was recognized (or NULL was given). If the name is
+// not recognized, false is returned and the previous setting is left untouched.
+bool SetSystemCharacterTable(const char *CharacterTable) {
+   if (!CharacterTable) {
+      SystemCharacterTable = NULL;
+      SystemCharacterTableIsSingleByte = true;
+      return true;
+   }
+   for (unsigned int i = 0; i < NumEntries(CharacterTables1); i++) {
+      if (CharacterTables1[i] && sameCharacterTableName(CharacterTable, CharacterTables1[i])) {
+         // Always store the table's own spelling, since that is what gets handed to iconv:
+         SystemCharacterTable = CharacterTables1[i];
+         SystemCharacterTableIsSingleByte = i <= SingleByteLimit;
+         return true;
+      }
+   }
+   for (unsigned int i = 0; i < NumEntries(CharacterTables2); i++) {
+      if (CharacterTables2[i] && sameCharacterTableName(CharacterTable, CharacterTables2[i])) {
+         SystemCharacterTable = CharacterTables2[i];
+         SystemCharacterTableIsSingleByte = true;
+         return true;
+      }
+   }
+   return false;
+}
+
+// Looks for a character table selector at the beginning of the given buffer
+// and returns the name of the selected table. If a table is found, buffer and
+// length are advanced past the selector (one byte, or three bytes for the
+// 0x10 form). Otherwise the default "ISO6937" is returned and buffer and
+// length remain unchanged.
+// If isSingleByte is given, it is set to true if the selected table is a
+// single byte table, and to false otherwise (including the default case).
+static const char *getCharacterTable(const unsigned char *&buffer, int &length, bool *isSingleByte = NULL) {
+   const char *const fallback = "ISO6937";
+   if (isSingleByte)
+      *isSingleByte = false;
+   // Nothing to inspect, or the first byte is already regular text:
+   if (length <= 0 || buffer[0] >= 0x20)
+      return fallback;
+   const unsigned int selector = buffer[0];
+   if (selector == 0x10) {
+      // Three byte form: 0x10 followed by a 16 bit index into CharacterTables2.
+      if (length < 3)
+         return fallback;
+      const unsigned int index = (buffer[1] << 8) | buffer[2];
+      const char *table = index < NumEntries(CharacterTables2) ? CharacterTables2[index] : NULL;
+      if (!table)
+         return fallback;
+      buffer += 3;
+      length -= 3;
+      if (isSingleByte)
+         *isSingleByte = true;
+      return table;
+   }
+   // One byte form: the selector itself is the index into CharacterTables1.
+   const char *table = selector < NumEntries(CharacterTables1) ? CharacterTables1[selector] : NULL;
+   if (!table)
+      return fallback;
+   buffer += 1;
+   length -= 1;
+   if (isSingleByte)
+      *isSingleByte = selector <= SingleByteLimit;
+   return table;
+}
+
+// Converts the 'fromLength' bytes at 'from', which are encoded in the character
+// table 'fromCode', into the system character table and stores the result as a
+// zero terminated string in 'to', which provides room for 'toLength' bytes.
+// Bytes that can't be converted are replaced with '?'. The result is silently
+// truncated if it doesn't fit into 'to'.
+// Returns true if a conversion took place. Returns false (leaving 'to'
+// untouched) if no system character table is set, if 'to' has no room at all,
+// or if iconv can't provide the requested conversion.
+static bool convertCharacterTable(const char *from, size_t fromLength, char *to, size_t toLength, const char *fromCode)
+{
+  if (SystemCharacterTable && toLength > 0) {
+     iconv_t cd = iconv_open(SystemCharacterTable, fromCode);
+     // iconv_open() reports failure as (iconv_t)-1, which a test for ">= 0" never catches:
+     if (cd != (iconv_t)-1) {
+        char *fromPtr = (char *)from;
+        toLength--; // reserve the last byte for the terminating 0, so iconv() can't use it up
+        while (fromLength > 0 && toLength > 0) {
+           if (iconv(cd, &fromPtr, &fromLength, &to, &toLength) == size_t(-1)) {
+              // Stop on anything but an invalid input sequence (e.g. output buffer full),
+              // and also if there is no room left for the replacement character:
+              if (errno != EILSEQ || toLength == 0)
+                 break;
+              // A character can't be converted, so mark it with '?' and proceed:
+              fromPtr++;
+              fromLength--;
+              *to++ = '?';
+              toLength--;
+           }
+        }
+        *to = 0; // always fits, thanks to the byte reserved above
+        iconv_close(cd);
+        return true;
+     }
+  }
+  return false;
+}
+
+// originally from libdtv, Copyright Rolf Hakenes <hakenes@hippomi.de>
 void String::decodeText(char *buffer, int size) {
   const unsigned char *from=data.getData(0);
   char *to=buffer;
-
-   /* Disable detection of coding tables - libdtv doesn't do it either
-   if ( (0x01 <= *from) && (*from <= 0x1f) ) {
-      codeTable=*from
-   }
-   */
-
-   if (*from == 0x10)
-      from += 3; // skips code table info
-
   int len=getLength();
+   if (len <= 0) {
+      *to = '\0';
+      return;
+      }
+   bool singleByte;
+   const char *cs = getCharacterTable(from, len, &singleByte);
   for (int i = 0; i < len; i++) {
      if (*from == 0)
         break;
@ -276,6 +420,11 @@ void String::decodeText(char *buffer, int size) {
         break;
   }
   *to = '\0';
+   if (!singleByte || !SystemCharacterTableIsSingleByte) {
+      char convBuffer[size];
+      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
+         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
+   }
 }

 void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
@ -283,11 +432,14 @@ void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int si
   char *to=buffer;
   char *toShort=shortVersion;
   int IsShortName=0;
-
-   if (*from == 0x10)
-      from += 3; // skips code table info
-
   int len=getLength();
+   if (len <= 0) {
+      *to = '\0';
+      *toShort = '\0';
+      return;
+      }
+   bool singleByte;
+   const char *cs = getCharacterTable(from, len, &singleByte);
   for (int i = 0; i < len; i++) {
      if (    ((' ' <= *from) && (*from <= '~'))
           || (*from == '\n')
@ -312,6 +464,14 @@ void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int si
   }
   *to = '\0';
   *toShort = '\0';
+   if (!singleByte || !SystemCharacterTableIsSingleByte) {
+      char convBuffer[sizeBuffer];
+      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
+         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
+      char convShortVersion[sizeShortVersion];
+      if (convertCharacterTable(shortVersion, strlen(shortVersion), convShortVersion, sizeof(convShortVersion), cs))
+         strncpy(shortVersion, convShortVersion, strlen(convShortVersion) + 1);
+   }
 }

 Descriptor *Descriptor::getDescriptor(CharArray da, DescriptorTagDomain domain, bool returnUnimplemetedDescriptor) {
--- a/libsi/si.h
+++ b/libsi/si.h
@ -6,7 +6,7 @@
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
- *   $Id: si.h 1.16 2007/02/03 11:47:25 kls Exp $
+ *   $Id: si.h 1.17 2007/04/22 13:32:09 kls Exp $
 *                                                                         *
 ***************************************************************************/

@ -486,6 +486,11 @@ protected:
   void decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion);
 };

+// Call this function to set the system character table. CharacterTable is a string
+// like "iso8859-15" or "utf-8" (case insensitive).
+// Returns true if the character table was recognized.
+bool SetSystemCharacterTable(const char *CharacterTable);
+
 } //end of namespace

 #endif //LIBSI_SI_H
--- a/vdr.c
+++ b/vdr.c
@ -22,7 +22,7 @@
 *
 * The project's page is at http://www.cadsoft.de/vdr
 *
- * $Id: vdr.c 1.286 2007/03/10 13:00:22 kls Exp $
+ * $Id: vdr.c 1.287 2007/04/22 13:28:32 kls Exp $
 */

 #include <getopt.h>
@ -455,15 +455,6 @@ int main(int argc, char *argv[])
     return 0;
     }

-  // Check for UTF-8 and exit if present - asprintf() will fail if it encounters 8 bit ASCII codes
-  char *LangEnv;
-  if ((LangEnv = getenv("LANG"))     != NULL && strcasestr(LangEnv, "utf") ||
-      (LangEnv = getenv("LC_ALL"))   != NULL && strcasestr(LangEnv, "utf") ||
-      (LangEnv = getenv("LC_CTYPE")) != NULL && strcasestr(LangEnv, "utf")) {
-     fprintf(stderr, "vdr: please turn off UTF-8 before starting VDR\n");
-     return 2;
-     }
-
  // Log file:

  if (SysLogLevel > 0)
@ -500,6 +491,18 @@ int main(int argc, char *argv[])
     dsyslog("running as daemon (tid=%d)", cThread::ThreadId());
  cThread::SetMainThreadId();

+  // Set the system character table:
+
+  char *LangEnv = getenv("LANG");
+  if (LangEnv) {
+     char *CodeSet = strchr(LangEnv, '.');
+     if (CodeSet) {
+        CodeSet++; // skip the dot
+        bool known = SI::SetSystemCharacterTable(CodeSet);
+        isyslog("codeset is '%s' - %s", CodeSet, known ? "known" : "unknown");
+        }
+     }
+
  // Main program loop variables - need to be here to have them initialized before any EXIT():

  cOsdObject *Menu = NULL;