Search notes:

Convert bytes to characters in different code pages

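The following simple C program converts each byte value from 128 to 255 into the character it represents in a selection of code pages and writes the result as an HTML file. The file contains a table in which a byte value can be compared across code pages (one row per byte value) and a code page can be read top to bottom (one column per code page).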
The result (the written output) can be viewed at https://renenyffenegger.ch/development/Windows/code-page/byte-2-char.html.

byte-2-char.c

//
//  See
//     https://renenyffenegger.ch/notes/Windows/development/code-page/byte-2-char
//     Output at: https://renenyffenegger.ch/development/Windows/code-page/byte-2-char.html
//

#include <windows.h>
#include <stdio.h>



// Writes `text` to `out`, replacing the HTML-significant characters
// '&', '<' and '>' by their entity references so that a converted
// character can never be mistaken for markup.
static void writeHtmlEscaped(FILE *out, const char *text) {
    for (; *text != '\0'; text++) {
        switch (*text) {
            case '&': fputs("&amp;", out); break;
            case '<': fputs("&lt;" , out); break;
            case '>': fputs("&gt;" , out); break;
            default : fputc(*text  , out); break;
        }
    }
}

// Creates byte-2-char.html in the current directory: an HTML table with one
// column per entry of codePages and one row per byte value 128..255. Each
// cell holds the UTF-8 encoding of the character that the row's byte
// represents in the column's code page; the cell is left empty if the byte
// cannot be converted.
//
// Returns 0 on success and 1 if the file could not be created or written.
int main(void) {

    typedef struct {
      UINT         id  ;
      const char  *name;
    } CodePage;

    // Code pages that become table columns. Each pair of values initializes
    // one CodePage (id, name). Entries that are commented out are simply not
    // rendered; uncomment a line to add the corresponding column.
    CodePage codePages[] = {
          37, "IBM EBCDIC US-Canada",
         437, "OEM United States"   ,
         500, "IBM EBCDIC International",
    //   708, "Arabic (ASMO 708)",
    //   709, "Arabic (ASMO-449+, BCON V4)",
    //   710, "Arabic - Transparent Arabic",
    //   720, "Arabic (Transparent ASMO); Arabic (DOS)",
         737, "OEM Greek (formerly 437G); Greek (DOS)",
    //   775, "OEM Baltic; Baltic (DOS)",
         850, "OEM Multilingual Latin 1; Western European (DOS)",
         852, "OEM Latin 2; Central European (DOS)",
    //   855, "OEM Cyrillic (primarily Russian)",
    //   857, "OEM Turkish; Turkish (DOS)",
         858, "OEM Multilingual Latin 1 + Euro symbol",
    //   860, "OEM Portuguese; Portuguese (DOS)",
    //   861, "OEM Icelandic; Icelandic (DOS)",
         862, "OEM Hebrew; Hebrew (DOS)",
    //   863, "OEM French Canadian; French Canadian (DOS)",
    //   864, "OEM Arabic; Arabic (864)",
         865, "OEM Nordic; Nordic (DOS)",
    //   866, "OEM Russian; Cyrillic (DOS)",
    //   869, "OEM Modern Greek; Greek, Modern (DOS)",
    //   870, "IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2",
    //   874, "ANSI/OEM Thai (ISO 8859-11); Thai (Windows)",
    //   875, "IBM EBCDIC Greek Modern",
    //   932, "ANSI/OEM Japanese; Japanese (Shift-JIS)",
    //   936, "ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)",
    //   949, "ANSI/OEM Korean (Unified Hangul Code)",
    //   950, "ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)",
    //  1026, "IBM EBCDIC Turkish (Latin 5)",
    //  1047, "IBM EBCDIC Latin 1/Open System",
    //  1140, "IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)",
    //  1141, "IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)",
    //  1142, "IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)",
    //  1143, "IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)",
    //  1144, "IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)",
    //  1145, "IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)",
    //  1146, "IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)",
    //  1147, "IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)",
    //  1148, "IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)",
    //  1149, "IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)",
    //  1200, "Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications",
    //  1201, "Unicode UTF-16, big endian byte order; available only to managed applications",
        1250, "ANSI Central European; Central European (Windows)",
        1251, "ANSI Cyrillic; Cyrillic (Windows)",
        1252, "ANSI Latin 1; Western European (Windows)",
    //  1253, "ANSI Greek; Greek (Windows)",
    //  1254, "ANSI Turkish; Turkish (Windows)",
        1255, "ANSI Hebrew; Hebrew (Windows)",
    //  1256, "ANSI Arabic; Arabic (Windows)",
    //  1257, "ANSI Baltic; Baltic (Windows)",
    //  1258, "ANSI/OEM Vietnamese; Vietnamese (Windows)",
    //  1361, "Korean (Johab)",
       10000, "MAC Roman; Western European (Mac)",
    // 10001, "Japanese (Mac)",
    // 10002, "MAC Traditional Chinese (Big5); Chinese Traditional (Mac)",
    // 10003, "Korean (Mac)",
    // 10004, "Arabic (Mac)",
    // 10005, "Hebrew (Mac)",
    // 10006, "Greek (Mac)",
    // 10007, "Cyrillic (Mac)",
    // 10008, "MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)",
    // 10010, "Romanian (Mac)",
    // 10017, "Ukrainian (Mac)",
    // 10021, "Thai (Mac)",
    // 10029, "MAC Latin 2; Central European (Mac)",
    // 10079, "Icelandic (Mac)",
    // 10081, "Turkish (Mac)",
    // 10082, "Croatian (Mac)",
    // 12000, "Unicode UTF-32, little endian byte order; available only to managed applications",
    // 12001, "Unicode UTF-32, big endian byte order; available only to managed applications",
    // 20000, "CNS Taiwan; Chinese Traditional (CNS)",
    // 20001, "TCA Taiwan",
    // 20002, "Eten Taiwan; Chinese Traditional (Eten)",
    // 20003, "IBM5550 Taiwan",
    // 20004, "TeleText Taiwan",
    // 20005, "Wang Taiwan",
       20105, "IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)",
       20106, "IA5 German (7-bit)",
    // 20107, "IA5 Swedish (7-bit)",
    // 20108, "IA5 Norwegian (7-bit)",
       20127, "US-ASCII (7-bit)",
       20261, "T.61",
       20269, "ISO 6937 Non-Spacing Accent",
    // 20273, "IBM EBCDIC Germany",
    // 20277, "IBM EBCDIC Denmark-Norway",
    // 20278, "IBM EBCDIC Finland-Sweden",
    // 20280, "IBM EBCDIC Italy",
    // 20284, "IBM EBCDIC Latin America-Spain",
    // 20285, "IBM EBCDIC United Kingdom",
    // 20290, "IBM EBCDIC Japanese Katakana Extended",
    // 20297, "IBM EBCDIC France",
    // 20420, "IBM EBCDIC Arabic",
    // 20423, "IBM EBCDIC Greek",
    // 20424, "IBM EBCDIC Hebrew",
    // 20833, "IBM EBCDIC Korean Extended",
    // 20838, "IBM EBCDIC Thai",
    // 20866, "Russian (KOI8-R); Cyrillic (KOI8-R)",
    // 20871, "IBM EBCDIC Icelandic",
    // 20880, "IBM EBCDIC Cyrillic Russian",
    // 20905, "IBM EBCDIC Turkish",
    // 20924, "IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)",
    // 20932, "Japanese (JIS 0208-1990 and 0212-1990)",
    // 20936, "Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)",
    // 20949, "Korean Wansung",
    // 21025, "IBM EBCDIC Cyrillic Serbian-Bulgarian",
    // 21866, "Ukrainian (KOI8-U); Cyrillic (KOI8-U)",
       28591, "ISO 8859-1 Latin 1; Western European (ISO)",
       28592, "ISO 8859-2 Central European; Central European (ISO)",
       28593, "ISO 8859-3 Latin 3",
    // 28594, "ISO 8859-4 Baltic",
    // 28595, "ISO 8859-5 Cyrillic",
    // 28596, "ISO 8859-6 Arabic",
       28597, "ISO 8859-7 Greek",
    // 28598, "ISO 8859-8 Hebrew; Hebrew (ISO-Visual)",
    // 28599, "ISO 8859-9 Turkish",
    // 28603, "ISO 8859-13 Estonian",
       28605, "ISO 8859-15 Latin 9",
       29001, "Europa 3",
    // 38598, "ISO 8859-8 Hebrew; Hebrew (ISO-Logical)",
    // 50220, "ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)",
    // 50221, "ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)",
    // 50222, "ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)",
    // 50225, "ISO 2022 Korean",
    // 50227, "ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)",
    // 50229, "ISO 2022 Traditional Chinese",
    // 50930, "EBCDIC Japanese (Katakana) Extended",
    // 50931, "EBCDIC US-Canada and Japanese",
    // 50933, "EBCDIC Korean Extended and Korean",
    // 50935, "EBCDIC Simplified Chinese Extended and Simplified Chinese",
    // 50936, "EBCDIC Simplified Chinese",
    // 50937, "EBCDIC US-Canada and Traditional Chinese",
    // 50939, "EBCDIC Japanese (Latin) Extended and Japanese",
    // 51932, "EUC Japanese",
    // 51936, "EUC Simplified Chinese; Chinese Simplified (EUC)",
    // 51949, "EUC Korean",
    // 51950, "EUC Traditional Chinese",
    // 52936, "HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)",
    // 54936, "Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)",
    // 57002, "ISCII Devanagari",
    // 57003, "ISCII Bangla",
    // 57004, "ISCII Tamil",
    // 57005, "ISCII Telugu",
    // 57006, "ISCII Assamese",
    // 57007, "ISCII Odia",
    // 57008, "ISCII Kannada",
    // 57009, "ISCII Malayalam",
    // 57010, "ISCII Gujarati",
    // 57011, "ISCII Punjabi",
    };

    const size_t nofCodePages = sizeof(codePages) / sizeof(codePages[0]);

    FILE  *html;

    size_t i;
    int    byte;
    int    nofWideChars;
    int    nofUtf8Bytes;

    unsigned char charIn[2]; // The byte to convert, followed by a terminating 0.
    char  utf8[5]; // According to RFC 3629, the maximum length of utf8 character is 4 bytes.
    WCHAR wc  [2]; // One UTF-16 code unit plus the terminating 0.


//  Values of the predefined code page constants (winnls.h), for reference:
//    CP_ACP        =     0
//    CP_OEMCP      =     1
//    CP_MACCP      =     2
//    CP_THREAD_ACP =     3
//    CP_SYMBOL     =    42
//    CP_UTF7       = 65000
//    CP_UTF8       = 65001


    html = fopen("byte-2-char.html", "w");
    if (html == NULL) {
      perror("byte-2-char.html");
      return 1;
    }

    fprintf(html, "<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body><table border=1>\n");

    // Header row: an empty corner cell followed by one cell per code page id.
    fprintf(html, "<tr><td></td>");
    for (i = 0; i < nofCodePages; i++) {
      fprintf(html, "<td>%u</td>", codePages[i].id); // , codePages[i].name);
    }
    fprintf(html, "</tr>\n");

    charIn[1] = 0;
    for (byte = 128; byte <= 255; byte++) {

      charIn[0] = (unsigned char) byte;

      fprintf(html, "<tr><td>%3d:</td>", byte);

      for (i = 0; i < nofCodePages; i++) {

        nofWideChars = MultiByteToWideChar(
          codePages[i].id       ,
          0                     , // dwFlags
          (const char *) charIn , // lpMultiByteStr
         -1                     , // cbMultiByte:  -1 = null terminated
          wc                    , // lpWideCharStr
          (int) (sizeof(wc) / sizeof(wc[0])) // cchWideChar
        );

        // Both functions return 0 on failure and then leave the output
        // buffer in an unspecified state (typically still holding the
        // previous column's text), so the second conversion is attempted
        // only if the first one succeeded.
        nofUtf8Bytes = 0;
        if (nofWideChars > 0) {
          nofUtf8Bytes = WideCharToMultiByte(
            CP_UTF8             , // CodePage
            0                   , // dwFlags
            wc                  , // lpWideCharStr
           -1                   , // cchWideChar    : -1: null terminated
            utf8                , // lpMultiByteStr
            (int) sizeof(utf8)  , // cbMultiByte
            NULL                , // lpDefaultChar
            NULL);                // lpUsedDefaultChar
        }

        // A failed conversion yields an empty cell rather than stale data.
        fputs("<td>", html);
        if (nofUtf8Bytes > 0) {
          writeHtmlEscaped(html, utf8);
        }
        fputs("</td>", html);

      }
      fprintf(html, "</tr>\n");

    }

    fprintf(html, "</table></body></html>");

    // fclose flushes the buffered output; a failure here means that the
    // file on disk is incomplete.
    if (fclose(html) != 0) {
      perror("byte-2-char.html");
      return 1;
    }

    return 0;
}
Github repository Windows-development, path: /code-page/byte-2-char.c

Index