--- utf.lua
-- see: https://williamaadams.wordpress.com/2012/06/16/messing-around-with-utf-8-in-luajit/
-- test https://github.com/Stepets/utf8.lua and compare performance to lua-utf8
-- @module utf
local utf = {}

-- local util = require "util" -- must NOT load ANY other module, this is needed in util.lua
local ffi = require "mffi"
local C = ffi.C
local util -- = require "util"
-- local utf8 = {} -- = require "utf8"

local byte = string.byte
local char = string.char
local concat = table.concat

local band = bit.band
local bor = bit.bor
local rshift = bit.rshift

-- True when `b` is a UTF-8 continuation byte (0x80..0xBF); nil (past end) is not.
local function isContinuation(b)
	return b ~= nil and b >= 0x80 and b <= 0xBF
end

-- Length (1..4) of the well-formed UTF-8 sequence starting at byte `p` of `s`,
-- or nil when the bytes at `p` do not start one. Rejects overlong forms,
-- surrogates (ED A0..BF xx) and anything above U+10FFFF.
local function wellFormedLength(s, p)
	local b1, b2, b3, b4 = s:byte(p, p + 3)
	if b1 <= 0x7F then
		return 1
	elseif b1 >= 0xC2 and b1 <= 0xDF then
		if isContinuation(b2) then
			return 2
		end
	elseif b1 >= 0xE0 and b1 <= 0xEF then
		-- the second byte range is narrowed for E0 (no overlongs) and ED (no surrogates)
		local lo, hi = 0x80, 0xBF
		if b1 == 0xE0 then
			lo = 0xA0
		elseif b1 == 0xED then
			hi = 0x9F
		end
		if b2 ~= nil and b2 >= lo and b2 <= hi and isContinuation(b3) then
			return 3
		end
	elseif b1 >= 0xF0 and b1 <= 0xF4 then
		-- the second byte range is narrowed for F0 (no overlongs) and F4 (max U+10FFFF)
		local lo, hi = 0x80, 0xBF
		if b1 == 0xF0 then
			lo = 0x90
		elseif b1 == 0xF4 then
			hi = 0x8F
		end
		if b2 ~= nil and b2 >= lo and b2 <= hi and isContinuation(b3) and isContinuation(b4) then
			return 4
		end
	end
	return nil
end

--- Replace every byte that is not part of a well-formed UTF-8 sequence.
-- The input is scanned exactly once; the replacement text is inserted verbatim
-- and never re-validated, so it may have any length (including zero).
-- @param s string to repair
-- @param replacement text substituted for each offending byte (defaults to U+FFFD)
-- @return the repaired string (the original string when nothing was replaced)
-- @return array with the byte position, in the returned string, of every replacement
function utf.fixUTF8(s, replacement)
	replacement = replacement or "\239\191\189"
	local len = #s
	local invalid = {}
	local pieces, pieceCount = {}, 0
	local outLen = 0 -- bytes already emitted into `pieces`
	local runStart = 1 -- first byte of the valid run not yet copied to `pieces`
	local p = 1
	while p <= len do
		local size = wellFormedLength(s, p)
		if size then
			p = p + size
		else
			if p > runStart then
				pieceCount = pieceCount + 1
				pieces[pieceCount] = s:sub(runStart, p - 1)
				outLen = outLen + (p - runStart)
			end
			invalid[#invalid + 1] = outLen + 1
			pieceCount = pieceCount + 1
			pieces[pieceCount] = replacement
			outLen = outLen + #replacement
			p = p + 1
			runStart = p
		end
	end
	if #invalid == 0 then
		return s, invalid
	end
	if runStart <= len then
		pieceCount = pieceCount + 1
		pieces[pieceCount] = s:sub(runStart)
	end
	return table.concat(pieces), invalid
end

--- Step over the UTF-8 character that starts at byte index `i`.
-- Only the lead byte is inspected: continuation bytes are not checked, so the
-- returned flag is a cheap classification of the lead byte and not a full
-- validation. The returned index may point past the end of `s` when the last
-- sequence is truncated.
-- @param s string to walk; nil ends the iteration immediately
-- @param i byte index of the character to step over; nil means "start at 1",
--   which is what a generic `for` passes when no start index was supplied
-- @return index of the byte following the character, or nothing when `i` is past the end
-- @return true when the lead byte can start a UTF-8 sequence, false otherwise
--   (for an invalid lead byte the index advances by exactly one byte)
function utf.next(s, i)
	if s == nil then
		return
	end
	i = i or 1
	if i > #s then
		return
	end
	local lead = s:byte(i)
	local width
	if lead <= 0x7F then
		width = 1
	elseif lead >= 0xC2 and lead <= 0xDF then
		width = 2
	elseif lead >= 0xE0 and lead <= 0xEF then
		width = 3
	elseif lead >= 0xF0 and lead <= 0xF4 then
		width = 4
	else -- stray continuation byte, overlong lead (C0/C1) or lead above F4
		return i + 1, false
	end
	return i + width, true
end

--- Build the generic-for triple that walks `s` one UTF-8 character at a time.
-- Each step yields the byte index just past the character that began at the
-- previous index, plus the flag returned by utf.next. Callers in this file
-- start with `previ = 1` and slice `s:sub(previous, i - 1)` to get the character.
-- @param s string to iterate
-- @param previ byte index of the first character to step over
-- @return iterator function, state and initial control value
function utf.iter(s, previ)
	local step = utf.next
	return step, s, previ
end

-- Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
-- local utf8ReplacementCharHex = 0xEFBFBD
local utf8ReplacementChar = char(0xEF, 0xBF, 0xBD)

--- Encode one Unicode code point as a UTF-8 byte string.
-- https://en.wikipedia.org/wiki/UTF-8
-- Anything that is not a Unicode scalar value (non-numbers, negative values,
-- the surrogate range U+D800..U+DFFF, values above U+10FFFF) yields U+FFFD, so
-- the result is always well-formed UTF-8.
-- @param codepoint numeric code point
-- @return UTF-8 encoding of `codepoint`, or the encoding of U+FFFD
function utf.codepointToUtf8(codepoint)
	if type(codepoint) ~= "number" then
		return utf8ReplacementChar
	end
	if codepoint >= 0x00 and codepoint <= 0x007F then
		-- 0xxxxxxx
		return char(codepoint)
	elseif codepoint >= 0x0080 and codepoint <= 0x07FF then
		-- 110xxxxx	10xxxxxx
		local b1 = bor(rshift(codepoint, 6), 0xC0)
		local b2 = bor(band(codepoint, 0x3F), 0x80)
		return char(b1, b2)
	elseif codepoint >= 0xD800 and codepoint <= 0xDFFF then
		-- UTF-16 surrogates are not characters and must not be encoded
		return utf8ReplacementChar
	elseif codepoint >= 0x0800 and codepoint <= 0xFFFF then
		-- 1110xxxx	10xxxxxx	10xxxxxx
		-- Euro sign = E2=226 82=130 AC=172 = U+20AC
		-- http://www.fileformat.info/info/unicode/char/20AC/index.htm
		local b1 = bor(rshift(codepoint, 12), 0xE0)
		local b2 = bor(band(rshift(codepoint, 6), 0x3F), 0x80)
		local b3 = bor(band(codepoint, 0x3F), 0x80)
		return char(b1, b2, b3)
	elseif codepoint >= 0x10000 and codepoint <= 0x10FFFF then
		-- 11110xxx	10xxxxxx	10xxxxxx	10xxxxxx
		-- 'MUSICAL SYMBOL G CLEF' (U+1D11E)
		-- http://www.fileformat.info/info/unicode/char/1D11E/index.htm
		local b1 = bor(rshift(codepoint, 18), 0xF0)
		local b2 = bor(band(rshift(codepoint, 12), 0x3F), 0x80)
		local b3 = bor(band(rshift(codepoint, 6), 0x3F), 0x80)
		local b4 = bor(band(codepoint, 0x3F), 0x80)
		return char(b1, b2, b3, b4)
	end
	-- negative, NaN or beyond the last Unicode code point
	return utf8ReplacementChar
end

--[[
function utf8ToCodepoint(utf8Char)

end
]]

-- Lookup from every ISO-8859-15 byte >= 0x80 (as a one-character string) to its
-- UTF-8 encoding. Built once at load time.
local latin9HighByteToUtf8 = {}
do
	-- the eight positions where ISO-8859-15 differs from ISO-8859-1
	local special = {
		[164] = char(226, 130, 172), -- U+20AC EURO SIGN
		[166] = char(197, 160), -- U+0160 LATIN CAPITAL LETTER S WITH CARON
		[168] = char(197, 161), -- U+0161 LATIN SMALL LETTER S WITH CARON
		[180] = char(197, 189), -- U+017D LATIN CAPITAL LETTER Z WITH CARON
		[184] = char(197, 190), -- U+017E LATIN SMALL LETTER Z WITH CARON
		[188] = char(197, 146), -- U+0152 LATIN CAPITAL LIGATURE OE
		[189] = char(197, 147), -- U+0153 LATIN SMALL LIGATURE OE
		[190] = char(197, 184), -- U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
	}
	for code = 128, 255 do
		local encoded = special[code]
		if encoded == nil then
			if code < 192 then
				encoded = char(194, code) -- U+0080..U+00BF -> C2 xx
			else
				encoded = char(195, code - 64) -- U+00C0..U+00FF -> C3 (xx - 0x40)
			end
		end
		latin9HighByteToUtf8[char(code)] = encoded
	end
end

--- Convert a string from ISO-8859-15 (Latin-9) to UTF-8.
-- http://en.wikipedia.org/wiki/ISO/IEC_8859-15
-- http://www.columbia.edu/kermit/latin9.html
-- Bytes below 0x80 are copied unchanged; every other byte is expanded to its
-- two- or three-byte UTF-8 form.
-- @param latin9String ISO-8859-15 encoded string
-- @return the UTF-8 string; nothing when the argument is nil; "" (after
--   reporting through util.printError) when it is any other non-string
function utf.latin9ToUtf8(latin9String)
	if type(latin9String) ~= "string" then
		if latin9String == nil then
			return -- empty print
		end
		if util == nil then
			util = require "util" -- loaded lazily: util itself depends on this module
		end
		util.printError("utf.latin9ToUtf8() parameter type '%s' is not a string, returning empty string", type(latin9String))
		return ""
	end
	-- parentheses drop gsub's second result (the substitution count)
	return (latin9String:gsub("[\128-\255]", latin9HighByteToUtf8))
end

-- Second byte of a 0xC5-led UTF-8 sequence -> ISO-8859-15 byte, for the
-- characters that Latin-9 places where Latin-1 had other symbols.
local utf8C5ToLatin9 = {
	[160] = 166, -- U+0160
	[161] = 168, -- U+0161
	[189] = 180, -- U+017D
	[190] = 184, -- U+017E
	[146] = 188, -- U+0152
	[147] = 189, -- U+0153
	[184] = 190, -- U+0178
}

-- True when `b` is a UTF-8 continuation byte; nil (read past the end) is not.
local function isUtf8Continuation(b)
	return b ~= nil and b >= 128 and b < 192
end

--- Convert a UTF-8 string to ISO-8859-15 (Latin-9).
-- Code points that Latin-9 cannot represent are dropped from the result, and
-- so are bytes that do not form a recognisable sequence (including a sequence
-- cut off by the end of the string). Works on the Lua string directly, so the
-- input may have any length.
-- @param utf8string UTF-8 encoded string
-- @return the ISO-8859-15 encoded string
function utf.utf8ToLatin9(utf8string)
	local sourceLen = #utf8string
	if sourceLen < 1 then
		return ""
	end
	local result = {}
	local used = 0
	local pos = 1
	while pos <= sourceLen do
		-- bytes past the end come back as nil and fail every test below
		local b0, b1, b2, b3, b4, b5 = byte(utf8string, pos, pos + 5)
		local out -- Latin-9 byte to emit, nil when the sequence is dropped
		local step = 1
		if b0 < 128 then
			out = b0
		elseif b0 == 226 and b1 == 130 and b2 == 172 then
			out = 164 -- U+20AC EURO SIGN
			step = 3
		elseif b0 == 194 and isUtf8Continuation(b1) then
			out = b1 -- U+0080..U+00BF
			step = 2
		elseif b0 == 195 and isUtf8Continuation(b1) then
			out = b1 + 64 -- U+00C0..U+00FF
			step = 2
		elseif b0 == 197 and utf8C5ToLatin9[b1] then
			out = utf8C5ToLatin9[b1]
			step = 2
		-- rest are unsupported codepoints, they are removed from the result
		elseif b0 >= 192 and b0 < 224 and isUtf8Continuation(b1) then
			step = 2
		elseif b0 >= 224 and b0 < 240 and isUtf8Continuation(b1) and isUtf8Continuation(b2) then
			step = 3
		elseif b0 >= 240 and b0 < 248 and isUtf8Continuation(b1) and isUtf8Continuation(b2) and isUtf8Continuation(b3) then
			step = 4
		elseif b0 >= 248 and b0 < 252 and isUtf8Continuation(b1) and isUtf8Continuation(b2) and isUtf8Continuation(b3) and isUtf8Continuation(b4) then
			step = 5
		elseif b0 >= 252 and b0 < 254 and isUtf8Continuation(b1) and isUtf8Continuation(b2) and isUtf8Continuation(b3) and isUtf8Continuation(b4) and isUtf8Continuation(b5) then
			step = 6
		end
		if out then
			used = used + 1
			result[used] = char(out)
		end
		pos = pos + step
	end
	return concat(result)
end

--- Parse a fixed-column code page listing into two lookup tables.
-- Every non-empty line is read by position: columns 1-4 hold the code page
-- byte ("0xNN"), columns 8-13 the Unicode value ("0xNNNN") and column 17
-- onwards the character name. Lines whose Unicode column is blank (undefined
-- code page positions) add nothing to either table.
-- @param name code page name (not used by the current implementation)
-- @param convertStr the listing text
-- @param codepageTbl filled with [code page character] = record
-- @param utf8Tbl filled with [UTF-8 character] = the same record
-- Each record has the fields cp, utf8 and charName.
local function codepageTableFill(name, convertStr, codepageTbl, utf8Tbl)
	for row in convertStr:gmatch("[^\r\n]+") do
		local codeChar = char(tonumber(row:sub(1, 4)))
		local unicodeField = row:sub(8, 13)
		if unicodeField ~= "      " then
			local entry = {
				cp = codeChar,
				charName = row:sub(17),
				utf8 = utf.codepointToUtf8(tonumber(unicodeField)),
			}
			codepageTbl[entry.cp] = entry
			utf8Tbl[entry.utf8] = entry
		end
	end
end

--[[
http://www.cp1252.com
#
#    Name:     cp1252 to Unicode table
#    Unicode version: 2.0
#    Table version: 2.01
#    Table format:  Format A
#    Date:          04/15/98
#
#    Contact:       Shawn.Steele@microsoft.com
#
#    General notes: none
#
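#    Format: Three tab-separated columns (upstream layout; in this file the
#        columns are space-padded to fixed positions: code in columns 1-4,
#        Unicode value in columns 8-13, name from column 17)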
#        Column #1 is the cp1252 code (in hex)
#        Column #2 is the Unicode (in hex as 0xXXXX)
#        Column #3 is the Unicode name (follows a comment sign, '#')
#
#    The entries are in cp1252 order
#
--]]

local cp1252ConvertStr = [[
0x00   0x0000   #NULL
0x01   0x0001   #START OF HEADING
0x02   0x0002   #START OF TEXT
0x03   0x0003   #END OF TEXT
0x04   0x0004   #END OF TRANSMISSION
0x05   0x0005   #ENQUIRY
0x06   0x0006   #ACKNOWLEDGE
0x07   0x0007   #BELL
0x08   0x0008   #BACKSPACE
0x09   0x0009   #HORIZONTAL TABULATION
0x0A   0x000A   #LINE FEED
0x0B   0x000B   #VERTICAL TABULATION
0x0C   0x000C   #FORM FEED
0x0D   0x000D   #CARRIAGE RETURN
0x0E   0x000E   #SHIFT OUT
0x0F   0x000F   #SHIFT IN
0x10   0x0010   #DATA LINK ESCAPE
0x11   0x0011   #DEVICE CONTROL ONE
0x12   0x0012   #DEVICE CONTROL TWO
0x13   0x0013   #DEVICE CONTROL THREE
0x14   0x0014   #DEVICE CONTROL FOUR
0x15   0x0015   #NEGATIVE ACKNOWLEDGE
0x16   0x0016   #SYNCHRONOUS IDLE
0x17   0x0017   #END OF TRANSMISSION BLOCK
0x18   0x0018   #CANCEL
0x19   0x0019   #END OF MEDIUM
0x1A   0x001A   #SUBSTITUTE
0x1B   0x001B   #ESCAPE
0x1C   0x001C   #FILE SEPARATOR
0x1D   0x001D   #GROUP SEPARATOR
0x1E   0x001E   #RECORD SEPARATOR
0x1F   0x001F   #UNIT SEPARATOR
0x20   0x0020   #SPACE
0x21   0x0021   #EXCLAMATION MARK
0x22   0x0022   #QUOTATION MARK
0x23   0x0023   #NUMBER SIGN
0x24   0x0024   #DOLLAR SIGN
0x25   0x0025   #PERCENT SIGN
0x26   0x0026   #AMPERSAND
0x27   0x0027   #APOSTROPHE
0x28   0x0028   #LEFT PARENTHESIS
0x29   0x0029   #RIGHT PARENTHESIS
0x2A   0x002A   #ASTERISK
0x2B   0x002B   #PLUS SIGN
0x2C   0x002C   #COMMA
0x2D   0x002D   #HYPHEN-MINUS
0x2E   0x002E   #FULL STOP
0x2F   0x002F   #SOLIDUS
0x30   0x0030   #DIGIT ZERO
0x31   0x0031   #DIGIT ONE
0x32   0x0032   #DIGIT TWO
0x33   0x0033   #DIGIT THREE
0x34   0x0034   #DIGIT FOUR
0x35   0x0035   #DIGIT FIVE
0x36   0x0036   #DIGIT SIX
0x37   0x0037   #DIGIT SEVEN
0x38   0x0038   #DIGIT EIGHT
0x39   0x0039   #DIGIT NINE
0x3A   0x003A   #COLON
0x3B   0x003B   #SEMICOLON
0x3C   0x003C   #LESS-THAN SIGN
0x3D   0x003D   #EQUALS SIGN
0x3E   0x003E   #GREATER-THAN SIGN
0x3F   0x003F   #QUESTION MARK
0x40   0x0040   #COMMERCIAL AT
0x41   0x0041   #LATIN CAPITAL LETTER A
0x42   0x0042   #LATIN CAPITAL LETTER B
0x43   0x0043   #LATIN CAPITAL LETTER C
0x44   0x0044   #LATIN CAPITAL LETTER D
0x45   0x0045   #LATIN CAPITAL LETTER E
0x46   0x0046   #LATIN CAPITAL LETTER F
0x47   0x0047   #LATIN CAPITAL LETTER G
0x48   0x0048   #LATIN CAPITAL LETTER H
0x49   0x0049   #LATIN CAPITAL LETTER I
0x4A   0x004A   #LATIN CAPITAL LETTER J
0x4B   0x004B   #LATIN CAPITAL LETTER K
0x4C   0x004C   #LATIN CAPITAL LETTER L
0x4D   0x004D   #LATIN CAPITAL LETTER M
0x4E   0x004E   #LATIN CAPITAL LETTER N
0x4F   0x004F   #LATIN CAPITAL LETTER O
0x50   0x0050   #LATIN CAPITAL LETTER P
0x51   0x0051   #LATIN CAPITAL LETTER Q
0x52   0x0052   #LATIN CAPITAL LETTER R
0x53   0x0053   #LATIN CAPITAL LETTER S
0x54   0x0054   #LATIN CAPITAL LETTER T
0x55   0x0055   #LATIN CAPITAL LETTER U
0x56   0x0056   #LATIN CAPITAL LETTER V
0x57   0x0057   #LATIN CAPITAL LETTER W
0x58   0x0058   #LATIN CAPITAL LETTER X
0x59   0x0059   #LATIN CAPITAL LETTER Y
0x5A   0x005A   #LATIN CAPITAL LETTER Z
0x5B   0x005B   #LEFT SQUARE BRACKET
0x5C   0x005C   #REVERSE SOLIDUS
0x5D   0x005D   #RIGHT SQUARE BRACKET
0x5E   0x005E   #CIRCUMFLEX ACCENT
0x5F   0x005F   #LOW LINE
0x60   0x0060   #GRAVE ACCENT
0x61   0x0061   #LATIN SMALL LETTER A
0x62   0x0062   #LATIN SMALL LETTER B
0x63   0x0063   #LATIN SMALL LETTER C
0x64   0x0064   #LATIN SMALL LETTER D
0x65   0x0065   #LATIN SMALL LETTER E
0x66   0x0066   #LATIN SMALL LETTER F
0x67   0x0067   #LATIN SMALL LETTER G
0x68   0x0068   #LATIN SMALL LETTER H
0x69   0x0069   #LATIN SMALL LETTER I
0x6A   0x006A   #LATIN SMALL LETTER J
0x6B   0x006B   #LATIN SMALL LETTER K
0x6C   0x006C   #LATIN SMALL LETTER L
0x6D   0x006D   #LATIN SMALL LETTER M
0x6E   0x006E   #LATIN SMALL LETTER N
0x6F   0x006F   #LATIN SMALL LETTER O
0x70   0x0070   #LATIN SMALL LETTER P
0x71   0x0071   #LATIN SMALL LETTER Q
0x72   0x0072   #LATIN SMALL LETTER R
0x73   0x0073   #LATIN SMALL LETTER S
0x74   0x0074   #LATIN SMALL LETTER T
0x75   0x0075   #LATIN SMALL LETTER U
0x76   0x0076   #LATIN SMALL LETTER V
0x77   0x0077   #LATIN SMALL LETTER W
0x78   0x0078   #LATIN SMALL LETTER X
0x79   0x0079   #LATIN SMALL LETTER Y
0x7A   0x007A   #LATIN SMALL LETTER Z
0x7B   0x007B   #LEFT CURLY BRACKET
0x7C   0x007C   #VERTICAL LINE
0x7D   0x007D   #RIGHT CURLY BRACKET
0x7E   0x007E   #TILDE
0x7F   0x007F   #DELETE
0x80   0x20AC   #EURO SIGN
0x81            #UNDEFINED
0x82   0x201A   #SINGLE LOW-9 QUOTATION MARK
0x83   0x0192   #LATIN SMALL LETTER F WITH HOOK
0x84   0x201E   #DOUBLE LOW-9 QUOTATION MARK
0x85   0x2026   #HORIZONTAL ELLIPSIS
0x86   0x2020   #DAGGER
0x87   0x2021   #DOUBLE DAGGER
0x88   0x02C6   #MODIFIER LETTER CIRCUMFLEX ACCENT
0x89   0x2030   #PER MILLE SIGN
0x8A   0x0160   #LATIN CAPITAL LETTER S WITH CARON
0x8B   0x2039   #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8C   0x0152   #LATIN CAPITAL LIGATURE OE
0x8D            #UNDEFINED
0x8E   0x017D   #LATIN CAPITAL LETTER Z WITH CARON
0x8F            #UNDEFINED
0x90            #UNDEFINED
0x91   0x2018   #LEFT SINGLE QUOTATION MARK
0x92   0x2019   #RIGHT SINGLE QUOTATION MARK
0x93   0x201C   #LEFT DOUBLE QUOTATION MARK
0x94   0x201D   #RIGHT DOUBLE QUOTATION MARK
0x95   0x2022   #BULLET
0x96   0x2013   #EN DASH
0x97   0x2014   #EM DASH
0x98   0x02DC   #SMALL TILDE
0x99   0x2122   #TRADE MARK SIGN
0x9A   0x0161   #LATIN SMALL LETTER S WITH CARON
0x9B   0x203A   #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9C   0x0153   #LATIN SMALL LIGATURE OE
0x9D            #UNDEFINED
0x9E   0x017E   #LATIN SMALL LETTER Z WITH CARON
0x9F   0x0178   #LATIN CAPITAL LETTER Y WITH DIAERESIS
0xA0   0x00A0   #NO-BREAK SPACE
0xA1   0x00A1   #INVERTED EXCLAMATION MARK
0xA2   0x00A2   #CENT SIGN
0xA3   0x00A3   #POUND SIGN
0xA4   0x00A4   #CURRENCY SIGN
0xA5   0x00A5   #YEN SIGN
0xA6   0x00A6   #BROKEN BAR
0xA7   0x00A7   #SECTION SIGN
0xA8   0x00A8   #DIAERESIS
0xA9   0x00A9   #COPYRIGHT SIGN
0xAA   0x00AA   #FEMININE ORDINAL INDICATOR
0xAB   0x00AB   #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xAC   0x00AC   #NOT SIGN
0xAD   0x00AD   #SOFT HYPHEN
0xAE   0x00AE   #REGISTERED SIGN
0xAF   0x00AF   #MACRON
0xB0   0x00B0   #DEGREE SIGN
0xB1   0x00B1   #PLUS-MINUS SIGN
0xB2   0x00B2   #SUPERSCRIPT TWO
0xB3   0x00B3   #SUPERSCRIPT THREE
0xB4   0x00B4   #ACUTE ACCENT
0xB5   0x00B5   #MICRO SIGN
0xB6   0x00B6   #PILCROW SIGN
0xB7   0x00B7   #MIDDLE DOT
0xB8   0x00B8   #CEDILLA
0xB9   0x00B9   #SUPERSCRIPT ONE
0xBA   0x00BA   #MASCULINE ORDINAL INDICATOR
0xBB   0x00BB   #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0xBC   0x00BC   #VULGAR FRACTION ONE QUARTER
0xBD   0x00BD   #VULGAR FRACTION ONE HALF
0xBE   0x00BE   #VULGAR FRACTION THREE QUARTERS
0xBF   0x00BF   #INVERTED QUESTION MARK
0xC0   0x00C0   #LATIN CAPITAL LETTER A WITH GRAVE
0xC1   0x00C1   #LATIN CAPITAL LETTER A WITH ACUTE
0xC2   0x00C2   #LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xC3   0x00C3   #LATIN CAPITAL LETTER A WITH TILDE
0xC4   0x00C4   #LATIN CAPITAL LETTER A WITH DIAERESIS
0xC5   0x00C5   #LATIN CAPITAL LETTER A WITH RING ABOVE
0xC6   0x00C6   #LATIN CAPITAL LETTER AE
0xC7   0x00C7   #LATIN CAPITAL LETTER C WITH CEDILLA
0xC8   0x00C8   #LATIN CAPITAL LETTER E WITH GRAVE
0xC9   0x00C9   #LATIN CAPITAL LETTER E WITH ACUTE
0xCA   0x00CA   #LATIN CAPITAL LETTER E WITH CIRCUMFLEX
0xCB   0x00CB   #LATIN CAPITAL LETTER E WITH DIAERESIS
0xCC   0x00CC   #LATIN CAPITAL LETTER I WITH GRAVE
0xCD   0x00CD   #LATIN CAPITAL LETTER I WITH ACUTE
0xCE   0x00CE   #LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xCF   0x00CF   #LATIN CAPITAL LETTER I WITH DIAERESIS
0xD0   0x00D0   #LATIN CAPITAL LETTER ETH
0xD1   0x00D1   #LATIN CAPITAL LETTER N WITH TILDE
0xD2   0x00D2   #LATIN CAPITAL LETTER O WITH GRAVE
0xD3   0x00D3   #LATIN CAPITAL LETTER O WITH ACUTE
0xD4   0x00D4   #LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xD5   0x00D5   #LATIN CAPITAL LETTER O WITH TILDE
0xD6   0x00D6   #LATIN CAPITAL LETTER O WITH DIAERESIS
0xD7   0x00D7   #MULTIPLICATION SIGN
0xD8   0x00D8   #LATIN CAPITAL LETTER O WITH STROKE
0xD9   0x00D9   #LATIN CAPITAL LETTER U WITH GRAVE
0xDA   0x00DA   #LATIN CAPITAL LETTER U WITH ACUTE
0xDB   0x00DB   #LATIN CAPITAL LETTER U WITH CIRCUMFLEX
0xDC   0x00DC   #LATIN CAPITAL LETTER U WITH DIAERESIS
0xDD   0x00DD   #LATIN CAPITAL LETTER Y WITH ACUTE
0xDE   0x00DE   #LATIN CAPITAL LETTER THORN
0xDF   0x00DF   #LATIN SMALL LETTER SHARP S
0xE0   0x00E0   #LATIN SMALL LETTER A WITH GRAVE
0xE1   0x00E1   #LATIN SMALL LETTER A WITH ACUTE
0xE2   0x00E2   #LATIN SMALL LETTER A WITH CIRCUMFLEX
0xE3   0x00E3   #LATIN SMALL LETTER A WITH TILDE
0xE4   0x00E4   #LATIN SMALL LETTER A WITH DIAERESIS
0xE5   0x00E5   #LATIN SMALL LETTER A WITH RING ABOVE
0xE6   0x00E6   #LATIN SMALL LETTER AE
0xE7   0x00E7   #LATIN SMALL LETTER C WITH CEDILLA
0xE8   0x00E8   #LATIN SMALL LETTER E WITH GRAVE
0xE9   0x00E9   #LATIN SMALL LETTER E WITH ACUTE
0xEA   0x00EA   #LATIN SMALL LETTER E WITH CIRCUMFLEX
0xEB   0x00EB   #LATIN SMALL LETTER E WITH DIAERESIS
0xEC   0x00EC   #LATIN SMALL LETTER I WITH GRAVE
0xED   0x00ED   #LATIN SMALL LETTER I WITH ACUTE
0xEE   0x00EE   #LATIN SMALL LETTER I WITH CIRCUMFLEX
0xEF   0x00EF   #LATIN SMALL LETTER I WITH DIAERESIS
0xF0   0x00F0   #LATIN SMALL LETTER ETH
0xF1   0x00F1   #LATIN SMALL LETTER N WITH TILDE
0xF2   0x00F2   #LATIN SMALL LETTER O WITH GRAVE
0xF3   0x00F3   #LATIN SMALL LETTER O WITH ACUTE
0xF4   0x00F4   #LATIN SMALL LETTER O WITH CIRCUMFLEX
0xF5   0x00F5   #LATIN SMALL LETTER O WITH TILDE
0xF6   0x00F6   #LATIN SMALL LETTER O WITH DIAERESIS
0xF7   0x00F7   #DIVISION SIGN
0xF8   0x00F8   #LATIN SMALL LETTER O WITH STROKE
0xF9   0x00F9   #LATIN SMALL LETTER U WITH GRAVE
0xFA   0x00FA   #LATIN SMALL LETTER U WITH ACUTE
0xFB   0x00FB   #LATIN SMALL LETTER U WITH CIRCUMFLEX
0xFC   0x00FC   #LATIN SMALL LETTER U WITH DIAERESIS
0xFD   0x00FD   #LATIN SMALL LETTER Y WITH ACUTE
0xFE   0x00FE   #LATIN SMALL LETTER THORN
0xFF   0x00FF   #LATIN SMALL LETTER Y WITH DIAERESIS
]]

-- http://www.cp1252.com
-- Lookup tables, created on first use by cp1252Init():
-- [cp1252 character] = record, and [UTF-8 character] = record.
local cp1252ToUtf8ConvertTbl
local utf8ToCp1252ConvertTbl

--- Build both cp1252 lookup tables from cp1252ConvertStr; later calls do nothing.
local function cp1252Init()
	if cp1252ToUtf8ConvertTbl then
		return
	end
	cp1252ToUtf8ConvertTbl, utf8ToCp1252ConvertTbl = {}, {}
	codepageTableFill("cp1252", cp1252ConvertStr, cp1252ToUtf8ConvertTbl, utf8ToCp1252ConvertTbl)
end

--[[
function utf.cp1252NumberToUtf8Char(num)
	cp1252Init()
end
]]

--- Convert a cp1252 (Windows-1252) string to UTF-8.
-- http://en.wikipedia.org/wiki/UTF-8
-- http://winbatch.hpdd.de/MyWbtHelp/htm/20100217.HowTo.ConvertBetweenCodepages.htm
-- Every input byte is looked up on its own; bytes without a table entry are
-- replaced by char(127).
-- @param str cp1252 encoded string
-- @return UTF-8 encoded string
function utf.cp1252ToUtf8(str)
	cp1252Init()
	local function translate(oneByte)
		local entry = cp1252ToUtf8ConvertTbl[oneByte]
		if entry then
			return entry.utf8
		end
		return char(127) -- utf.latin9ToUtf8(subStr)
	end
	-- parentheses drop gsub's second result (the substitution count)
	return (str:gsub(".", translate))
end

--- Convert a UTF-8 string to cp1252 (Windows-1252).
-- http://en.wikipedia.org/wiki/UTF-8
-- http://winbatch.hpdd.de/MyWbtHelp/htm/20100217.HowTo.ConvertBetweenCodepages.htm
-- Characters without a cp1252 equivalent become char(26). Bytes that utf.iter
-- flags as invalid are left out of the result; the first ten of them per call
-- are reported through util.print.
-- @param str UTF-8 encoded string
-- @return cp1252 encoded string
function utf.utf8ToCp1252(str)
	if utf8ToCp1252ConvertTbl == nil then
		cp1252Init()
	end
	local out, outCount = {}, 0
	local charStart = 1
	local reported = 0
	for nextStart, valid in utf.iter(str, charStart) do
		if not valid then
			reported = reported + 1
			if reported <= 10 then
				if util == nil then
					util = require "util" -- loaded lazily: util itself depends on this module
				end
				util.print("utf8ToCp1252, invalid char in position %d - %d", charStart, nextStart - 1)
			end
		else
			local entry = utf8ToCp1252ConvertTbl[str:sub(charStart, nextStart - 1)]
			outCount = outCount + 1
			if entry then
				out[outCount] = entry.cp
			else
				out[outCount] = char(26) -- utf.utf8ToLatin9(subStr) -- convert multibyte to single-byte
			end
		end
		charStart = nextStart
	end
	return concat(out)
end

--[[
	-- http://winbatch.hpdd.de/MyWbtHelp/htm/20100217.HowTo.ConvertBetweenCodepages.htm
#
#    Name:     cp850_DOSLatin1 to Unicode table
#    Unicode version: 2.0
#    Table version: 2.00
#    Table format:  Format A
#    Date:          04/24/96
#    Contact: Shawn.Steele@microsoft.com
#
#    General notes: none
#
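#    Format: Three tab-separated columns (upstream layout; in this file the
#        columns are space-padded to fixed positions: code in columns 1-4,
#        Unicode value in columns 8-13, name from column 17)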
#        Column #1 is the cp850_DOSLatin1 code (in hex)
#        Column #2 is the Unicode (in hex as 0xXXXX)
#        Column #3 is the Unicode name (follows a comment sign, '#')
#
#    The entries are in cp850_DOSLatin1 order
#
]]

local cp850ConvertStr = [[
0x00   0x0000   #NULL
0x01   0x0001   #START OF HEADING
0x02   0x0002   #START OF TEXT
0x03   0x0003   #END OF TEXT
0x04   0x0004   #END OF TRANSMISSION
0x05   0x0005   #ENQUIRY
0x06   0x0006   #ACKNOWLEDGE
0x07   0x0007   #BELL
0x08   0x0008   #BACKSPACE
0x09   0x0009   #HORIZONTAL TABULATION
0x0a   0x000a   #LINE FEED
0x0b   0x000b   #VERTICAL TABULATION
0x0c   0x000c   #FORM FEED
0x0d   0x000d   #CARRIAGE RETURN
0x0e   0x000e   #SHIFT OUT
0x0f   0x000f   #SHIFT IN
0x10   0x0010   #DATA LINK ESCAPE
0x11   0x0011   #DEVICE CONTROL ONE
0x12   0x0012   #DEVICE CONTROL TWO
0x13   0x0013   #DEVICE CONTROL THREE
0x14   0x0014   #DEVICE CONTROL FOUR
0x15   0x0015   #NEGATIVE ACKNOWLEDGE
0x16   0x0016   #SYNCHRONOUS IDLE
0x17   0x0017   #END OF TRANSMISSION BLOCK
0x18   0x0018   #CANCEL
0x19   0x0019   #END OF MEDIUM
0x1a   0x001a   #SUBSTITUTE
0x1b   0x001b   #ESCAPE
0x1c   0x001c   #FILE SEPARATOR
0x1d   0x001d   #GROUP SEPARATOR
0x1e   0x001e   #RECORD SEPARATOR
0x1f   0x001f   #UNIT SEPARATOR
0x20   0x0020   #SPACE
0x21   0x0021   #EXCLAMATION MARK
0x22   0x0022   #QUOTATION MARK
0x23   0x0023   #NUMBER SIGN
0x24   0x0024   #DOLLAR SIGN
0x25   0x0025   #PERCENT SIGN
0x26   0x0026   #AMPERSAND
0x27   0x0027   #APOSTROPHE
0x28   0x0028   #LEFT PARENTHESIS
0x29   0x0029   #RIGHT PARENTHESIS
0x2a   0x002a   #ASTERISK
0x2b   0x002b   #PLUS SIGN
0x2c   0x002c   #COMMA
0x2d   0x002d   #HYPHEN-MINUS
0x2e   0x002e   #FULL STOP
0x2f   0x002f   #SOLIDUS
0x30   0x0030   #DIGIT ZERO
0x31   0x0031   #DIGIT ONE
0x32   0x0032   #DIGIT TWO
0x33   0x0033   #DIGIT THREE
0x34   0x0034   #DIGIT FOUR
0x35   0x0035   #DIGIT FIVE
0x36   0x0036   #DIGIT SIX
0x37   0x0037   #DIGIT SEVEN
0x38   0x0038   #DIGIT EIGHT
0x39   0x0039   #DIGIT NINE
0x3a   0x003a   #COLON
0x3b   0x003b   #SEMICOLON
0x3c   0x003c   #LESS-THAN SIGN
0x3d   0x003d   #EQUALS SIGN
0x3e   0x003e   #GREATER-THAN SIGN
0x3f   0x003f   #QUESTION MARK
0x40   0x0040   #COMMERCIAL AT
0x41   0x0041   #LATIN CAPITAL LETTER A
0x42   0x0042   #LATIN CAPITAL LETTER B
0x43   0x0043   #LATIN CAPITAL LETTER C
0x44   0x0044   #LATIN CAPITAL LETTER D
0x45   0x0045   #LATIN CAPITAL LETTER E
0x46   0x0046   #LATIN CAPITAL LETTER F
0x47   0x0047   #LATIN CAPITAL LETTER G
0x48   0x0048   #LATIN CAPITAL LETTER H
0x49   0x0049   #LATIN CAPITAL LETTER I
0x4a   0x004a   #LATIN CAPITAL LETTER J
0x4b   0x004b   #LATIN CAPITAL LETTER K
0x4c   0x004c   #LATIN CAPITAL LETTER L
0x4d   0x004d   #LATIN CAPITAL LETTER M
0x4e   0x004e   #LATIN CAPITAL LETTER N
0x4f   0x004f   #LATIN CAPITAL LETTER O
0x50   0x0050   #LATIN CAPITAL LETTER P
0x51   0x0051   #LATIN CAPITAL LETTER Q
0x52   0x0052   #LATIN CAPITAL LETTER R
0x53   0x0053   #LATIN CAPITAL LETTER S
0x54   0x0054   #LATIN CAPITAL LETTER T
0x55   0x0055   #LATIN CAPITAL LETTER U
0x56   0x0056   #LATIN CAPITAL LETTER V
0x57   0x0057   #LATIN CAPITAL LETTER W
0x58   0x0058   #LATIN CAPITAL LETTER X
0x59   0x0059   #LATIN CAPITAL LETTER Y
0x5a   0x005a   #LATIN CAPITAL LETTER Z
0x5b   0x005b   #LEFT SQUARE BRACKET
0x5c   0x005c   #REVERSE SOLIDUS
0x5d   0x005d   #RIGHT SQUARE BRACKET
0x5e   0x005e   #CIRCUMFLEX ACCENT
0x5f   0x005f   #LOW LINE
0x60   0x0060   #GRAVE ACCENT
0x61   0x0061   #LATIN SMALL LETTER A
0x62   0x0062   #LATIN SMALL LETTER B
0x63   0x0063   #LATIN SMALL LETTER C
0x64   0x0064   #LATIN SMALL LETTER D
0x65   0x0065   #LATIN SMALL LETTER E
0x66   0x0066   #LATIN SMALL LETTER F
0x67   0x0067   #LATIN SMALL LETTER G
0x68   0x0068   #LATIN SMALL LETTER H
0x69   0x0069   #LATIN SMALL LETTER I
0x6a   0x006a   #LATIN SMALL LETTER J
0x6b   0x006b   #LATIN SMALL LETTER K
0x6c   0x006c   #LATIN SMALL LETTER L
0x6d   0x006d   #LATIN SMALL LETTER M
0x6e   0x006e   #LATIN SMALL LETTER N
0x6f   0x006f   #LATIN SMALL LETTER O
0x70   0x0070   #LATIN SMALL LETTER P
0x71   0x0071   #LATIN SMALL LETTER Q
0x72   0x0072   #LATIN SMALL LETTER R
0x73   0x0073   #LATIN SMALL LETTER S
0x74   0x0074   #LATIN SMALL LETTER T
0x75   0x0075   #LATIN SMALL LETTER U
0x76   0x0076   #LATIN SMALL LETTER V
0x77   0x0077   #LATIN SMALL LETTER W
0x78   0x0078   #LATIN SMALL LETTER X
0x79   0x0079   #LATIN SMALL LETTER Y
0x7a   0x007a   #LATIN SMALL LETTER Z
0x7b   0x007b   #LEFT CURLY BRACKET
0x7c   0x007c   #VERTICAL LINE
0x7d   0x007d   #RIGHT CURLY BRACKET
0x7e   0x007e   #TILDE
0x7f   0x007f   #DELETE
0x80   0x00c7   #LATIN CAPITAL LETTER C WITH CEDILLA
0x81   0x00fc   #LATIN SMALL LETTER U WITH DIAERESIS
0x82   0x00e9   #LATIN SMALL LETTER E WITH ACUTE
0x83   0x00e2   #LATIN SMALL LETTER A WITH CIRCUMFLEX
0x84   0x00e4   #LATIN SMALL LETTER A WITH DIAERESIS
0x85   0x00e0   #LATIN SMALL LETTER A WITH GRAVE
0x86   0x00e5   #LATIN SMALL LETTER A WITH RING ABOVE
0x87   0x00e7   #LATIN SMALL LETTER C WITH CEDILLA
0x88   0x00ea   #LATIN SMALL LETTER E WITH CIRCUMFLEX
0x89   0x00eb   #LATIN SMALL LETTER E WITH DIAERESIS
0x8a   0x00e8   #LATIN SMALL LETTER E WITH GRAVE
0x8b   0x00ef   #LATIN SMALL LETTER I WITH DIAERESIS
0x8c   0x00ee   #LATIN SMALL LETTER I WITH CIRCUMFLEX
0x8d   0x00ec   #LATIN SMALL LETTER I WITH GRAVE
0x8e   0x00c4   #LATIN CAPITAL LETTER A WITH DIAERESIS
0x8f   0x00c5   #LATIN CAPITAL LETTER A WITH RING ABOVE
0x90   0x00c9   #LATIN CAPITAL LETTER E WITH ACUTE
0x91   0x00e6   #LATIN SMALL LIGATURE AE
0x92   0x00c6   #LATIN CAPITAL LIGATURE AE
0x93   0x00f4   #LATIN SMALL LETTER O WITH CIRCUMFLEX
0x94   0x00f6   #LATIN SMALL LETTER O WITH DIAERESIS
0x95   0x00f2   #LATIN SMALL LETTER O WITH GRAVE
0x96   0x00fb   #LATIN SMALL LETTER U WITH CIRCUMFLEX
0x97   0x00f9   #LATIN SMALL LETTER U WITH GRAVE
0x98   0x00ff   #LATIN SMALL LETTER Y WITH DIAERESIS
0x99   0x00d6   #LATIN CAPITAL LETTER O WITH DIAERESIS
0x9a   0x00dc   #LATIN CAPITAL LETTER U WITH DIAERESIS
0x9b   0x00f8   #LATIN SMALL LETTER O WITH STROKE
0x9c   0x00a3   #POUND SIGN
0x9d   0x00d8   #LATIN CAPITAL LETTER O WITH STROKE
0x9e   0x00d7   #MULTIPLICATION SIGN
0x9f   0x0192   #LATIN SMALL LETTER F WITH HOOK
0xa0   0x00e1   #LATIN SMALL LETTER A WITH ACUTE
0xa1   0x00ed   #LATIN SMALL LETTER I WITH ACUTE
0xa2   0x00f3   #LATIN SMALL LETTER O WITH ACUTE
0xa3   0x00fa   #LATIN SMALL LETTER U WITH ACUTE
0xa4   0x00f1   #LATIN SMALL LETTER N WITH TILDE
0xa5   0x00d1   #LATIN CAPITAL LETTER N WITH TILDE
0xa6   0x00aa   #FEMININE ORDINAL INDICATOR
0xa7   0x00ba   #MASCULINE ORDINAL INDICATOR
0xa8   0x00bf   #INVERTED QUESTION MARK
0xa9   0x00ae   #REGISTERED SIGN
0xaa   0x00ac   #NOT SIGN
0xab   0x00bd   #VULGAR FRACTION ONE HALF
0xac   0x00bc   #VULGAR FRACTION ONE QUARTER
0xad   0x00a1   #INVERTED EXCLAMATION MARK
0xae   0x00ab   #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xaf   0x00bb   #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0xb0   0x2591   #LIGHT SHADE
0xb1   0x2592   #MEDIUM SHADE
0xb2   0x2593   #DARK SHADE
0xb3   0x2502   #BOX DRAWINGS LIGHT VERTICAL
0xb4   0x2524   #BOX DRAWINGS LIGHT VERTICAL AND LEFT
0xb5   0x00c1   #LATIN CAPITAL LETTER A WITH ACUTE
0xb6   0x00c2   #LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xb7   0x00c0   #LATIN CAPITAL LETTER A WITH GRAVE
0xb8   0x00a9   #COPYRIGHT SIGN
0xb9   0x2563   #BOX DRAWINGS DOUBLE VERTICAL AND LEFT
0xba   0x2551   #BOX DRAWINGS DOUBLE VERTICAL
0xbb   0x2557   #BOX DRAWINGS DOUBLE DOWN AND LEFT
0xbc   0x255d   #BOX DRAWINGS DOUBLE UP AND LEFT
0xbd   0x00a2   #CENT SIGN
0xbe   0x00a5   #YEN SIGN
0xbf   0x2510   #BOX DRAWINGS LIGHT DOWN AND LEFT
0xc0   0x2514   #BOX DRAWINGS LIGHT UP AND RIGHT
0xc1   0x2534   #BOX DRAWINGS LIGHT UP AND HORIZONTAL
0xc2   0x252c   #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
0xc3   0x251c   #BOX DRAWINGS LIGHT VERTICAL AND RIGHT
0xc4   0x2500   #BOX DRAWINGS LIGHT HORIZONTAL
0xc5   0x253c   #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
0xc6   0x00e3   #LATIN SMALL LETTER A WITH TILDE
0xc7   0x00c3   #LATIN CAPITAL LETTER A WITH TILDE
0xc8   0x255a   #BOX DRAWINGS DOUBLE UP AND RIGHT
0xc9   0x2554   #BOX DRAWINGS DOUBLE DOWN AND RIGHT
0xca   0x2569   #BOX DRAWINGS DOUBLE UP AND HORIZONTAL
0xcb   0x2566   #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
0xcc   0x2560   #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
0xcd   0x2550   #BOX DRAWINGS DOUBLE HORIZONTAL
0xce   0x256c   #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
0xcf   0x00a4   #CURRENCY SIGN
0xd0   0x00f0   #LATIN SMALL LETTER ETH
0xd1   0x00d0   #LATIN CAPITAL LETTER ETH
0xd2   0x00ca   #LATIN CAPITAL LETTER E WITH CIRCUMFLEX
0xd3   0x00cb   #LATIN CAPITAL LETTER E WITH DIAERESIS
0xd4   0x00c8   #LATIN CAPITAL LETTER E WITH GRAVE
0xd5   0x0131   #LATIN SMALL LETTER DOTLESS I
0xd6   0x00cd   #LATIN CAPITAL LETTER I WITH ACUTE
0xd7   0x00ce   #LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xd8   0x00cf   #LATIN CAPITAL LETTER I WITH DIAERESIS
0xd9   0x2518   #BOX DRAWINGS LIGHT UP AND LEFT
0xda   0x250c   #BOX DRAWINGS LIGHT DOWN AND RIGHT
0xdb   0x2588   #FULL BLOCK
0xdc   0x2584   #LOWER HALF BLOCK
0xdd   0x00a6   #BROKEN BAR
0xde   0x00cc   #LATIN CAPITAL LETTER I WITH GRAVE
0xdf   0x2580   #UPPER HALF BLOCK
0xe0   0x00d3   #LATIN CAPITAL LETTER O WITH ACUTE
0xe1   0x00df   #LATIN SMALL LETTER SHARP S
0xe2   0x00d4   #LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xe3   0x00d2   #LATIN CAPITAL LETTER O WITH GRAVE
0xe4   0x00f5   #LATIN SMALL LETTER O WITH TILDE
0xe5   0x00d5   #LATIN CAPITAL LETTER O WITH TILDE
0xe6   0x00b5   #MICRO SIGN
0xe7   0x00fe   #LATIN SMALL LETTER THORN
0xe8   0x00de   #LATIN CAPITAL LETTER THORN
0xe9   0x00da   #LATIN CAPITAL LETTER U WITH ACUTE
0xea   0x00db   #LATIN CAPITAL LETTER U WITH CIRCUMFLEX
0xeb   0x00d9   #LATIN CAPITAL LETTER U WITH GRAVE
0xec   0x00fd   #LATIN SMALL LETTER Y WITH ACUTE
0xed   0x00dd   #LATIN CAPITAL LETTER Y WITH ACUTE
0xee   0x00af   #MACRON
0xef   0x00b4   #ACUTE ACCENT
0xf0   0x00ad   #SOFT HYPHEN
0xf1   0x00b1   #PLUS-MINUS SIGN
0xf2   0x2017   #DOUBLE LOW LINE
0xf3   0x00be   #VULGAR FRACTION THREE QUARTERS
0xf4   0x00b6   #PILCROW SIGN
0xf5   0x00a7   #SECTION SIGN
0xf6   0x00f7   #DIVISION SIGN
0xf7   0x00b8   #CEDILLA
0xf8   0x00b0   #DEGREE SIGN
0xf9   0x00a8   #DIAERESIS
0xfa   0x00b7   #MIDDLE DOT
0xfb   0x00b9   #SUPERSCRIPT ONE
0xfc   0x00b3   #SUPERSCRIPT THREE
0xfd   0x00b2   #SUPERSCRIPT TWO
0xfe   0x25a0   #BLACK SQUARE
0xff   0x00a0   #NO-BREAK SPACE
]]

-- Lookup tables, created on first use by cp850Init():
-- [cp850 character] = record, and [UTF-8 character] = record.
local cp850ToUtf8ConvertTbl
local utf8ToCp850ConvertTbl

--- Build both cp850 lookup tables from cp850ConvertStr.
-- Safe to call repeatedly: once both tables exist the call does nothing. The
-- tables are filled in locals first and published only after the fill has
-- finished, so a failure while parsing cannot leave half-built tables behind.
local function cp850Init()
	if cp850ToUtf8ConvertTbl and utf8ToCp850ConvertTbl then
		return
	end
	local toUtf8, fromUtf8 = {}, {}
	codepageTableFill("cp850", cp850ConvertStr, toUtf8, fromUtf8)
	cp850ToUtf8ConvertTbl, utf8ToCp850ConvertTbl = toUtf8, fromUtf8
end

--- Convert a cp850 (DOS Latin-1) string to UTF-8.
-- http://en.wikipedia.org/wiki/UTF-8
-- http://winbatch.hpdd.de/MyWbtHelp/htm/20100217.HowTo.ConvertBetweenCodepages.htm
-- Every input byte is looked up on its own; bytes without a table entry are
-- replaced by char(26).
-- @param str cp850 encoded string
-- @return UTF-8 encoded string
function utf.cp850ToUtf8(str)
	if cp850ToUtf8ConvertTbl == nil then
		cp850Init()
	end
	local out = {}
	-- exactly one output element per input byte, so the byte index is the slot
	for pos = 1, #str do
		local entry = cp850ToUtf8ConvertTbl[str:sub(pos, pos)]
		if entry then
			out[pos] = entry.utf8
		else
			out[pos] = char(26) -- utf.latin9ToUtf8(subStr)
		end
	end
	return concat(out)
end

-- Set while this module is inside print() reporting an invalid byte; a call to
-- utf.utf8ToCp850 made during that time returns its argument unconverted.
local errorPrint = false

--- Convert a UTF-8 string to cp850 (DOS Latin-1).
-- http://en.wikipedia.org/wiki/UTF-8
-- http://winbatch.hpdd.de/MyWbtHelp/htm/20100217.HowTo.ConvertBetweenCodepages.htm
-- Characters without a cp850 equivalent become char(26). Bytes that utf.iter
-- flags as invalid are left out of the result and reported with print().
-- @param str UTF-8 encoded string
-- @return cp850 encoded string (or `str` itself while an error report is being printed)
function utf.utf8ToCp850(str)
	if errorPrint then
		return str
	end
	if cp850ToUtf8ConvertTbl == nil then
		cp850Init()
	end
	local out, outCount = {}, 0
	local charStart = 1
	for nextStart, valid in utf.iter(str, charStart) do
		local piece = str:sub(charStart, nextStart - 1)
		if valid then
			local entry = utf8ToCp850ConvertTbl[piece]
			outCount = outCount + 1
			if entry then
				out[outCount] = entry.cp
			else
				out[outCount] = char(26) -- utf.utf8ToLatin9(subStr) -- convert multibyte to single-byte
			end
		else
			local codes = {}
			for k = 1, #piece do
				codes[k] = piece:byte(k)
			end
			local codeList = table.concat(codes, ", ")
			errorPrint = true
			print(string.format("utf8ToCp850, invalid char in position %d - %d: '%s', bytes: %s", charStart, nextStart - 1, piece, codeList))
			errorPrint = false
		end
		charStart = nextStart
	end
	return concat(out)
end

-- C declarations for the two Win32 conversion functions called by the
-- Windows-only utf.u2w / utf.a2w / utf.w2u / utf.w2a below.
-- The cdef itself is executed on every platform; only the calls are guarded
-- by the ffi.os check.
ffi.cdef [[
	int MultiByteToWideChar(unsigned int CodePage, unsigned long dwFlags, const char* lpMultiByteStr, int cbMultiByte, wchar_t* lpWideCharStr, int cchWideChar);
		int WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, const wchar_t* lpWideCharStr, int cchWideChar, char* lpMultiByteStr, int cchMultiByte, const char* lpDefaultChar, int* pfUsedDefaultChar);
]]

-- Values passed as the CodePage argument of the functions declared above.
local CP_UTF8 = 65001 -- Win32 code page identifier for UTF-8
local CP_ACP = 0 -- Win32 identifier for the system default ANSI code page

-- Wide-character conversions. The Windows branch defines u2w, a2w, w2u and
-- w2a; the other branch defines only a2w and w2a.
if ffi.os == "Windows" then -- can't use util, loop load
	-- Convert `input` from `codePage` to a wchar_t array: one call to measure,
	-- one call to convert. The array has one spare (zero) element at the end.
	local function multiByteToWide(codePage, input)
		local byteCount = #input
		local wideCount = C.MultiByteToWideChar(codePage, 0, input, byteCount, nil, 0)
		local wideBuf = ffi.new('wchar_t[?]', wideCount + 1)
		C.MultiByteToWideChar(codePage, 0, input, byteCount, wideBuf, wideCount)
		return wideBuf, wideCount
	end

	-- Convert a wchar_t array to a Lua string in `codePage`. A missing `wlen`
	-- is passed on as -1 (null terminated string).
	local function wideToMultiByte(codePage, wstr, wlen)
		if ffi.isNull(wstr) then
			return ""
		end
		local wideCount = wlen or -1
		local byteCount = C.WideCharToMultiByte(codePage, 0, wstr, wideCount, nil, 0, nil, nil)
		local byteBuf = ffi.new('char[?]', byteCount + 1)
		C.WideCharToMultiByte(codePage, 0, wstr, wideCount, byteBuf, byteCount, nil, nil)
		return ffi.string(byteBuf)
	end

	--- UTF-8 string -> wide string. Returns the wchar_t array and its length.
	function utf.u2w(input)
		return multiByteToWide(CP_UTF8, input)
	end

	--- ANSI code page string -> wide string. Returns the wchar_t array and its length.
	function utf.a2w(input)
		return multiByteToWide(CP_ACP, input)
	end

	--- Wide string -> UTF-8 Lua string ("" for a null pointer).
	function utf.w2u(wstr, wlen)
		return wideToMultiByte(CP_UTF8, wstr, wlen)
	end

	--- Wide string -> ANSI code page Lua string ("" for a null pointer).
	function utf.w2a(wstr, wlen)
		return wideToMultiByte(CP_ACP, wstr, wlen)
	end
else
	-- NOTE(review): mbstowcs / wcslen / wcstombs are not declared in this file;
	-- presumably the mffi module declares them - confirm.

	--- Multibyte string -> wide string via mbstowcs.
	-- Returns the wchar_t array (sized #input + 1) and the value mbstowcs returned.
	function utf.a2w(input)
		local capacity = #input
		local wideBuf = ffi.new('wchar_t[?]', capacity + 1)
		local wideCount = C.mbstowcs(wideBuf, input, capacity)
		return wideBuf, wideCount
	end

	--- Wide string -> multibyte Lua string via wcstombs ("" for a null pointer).
	-- NOTE(review): the output buffer and the wcstombs limit are both sized from
	-- the wide-character count; presumably this truncates text whose characters
	-- need more than one byte each - confirm against the locale in use.
	function utf.w2a(wstr, wlen)
		if ffi.isNull(wstr) then
			return ""
		end
		local count = wlen or C.wcslen(wstr)
		local byteBuf = ffi.new('char[?]', count + 1) -- * 2 + 1)
		C.wcstombs(byteBuf, wstr, count)
		return ffi.string(byteBuf)
	end
end

--- Convert a UTF-8 string by passing it through utf.u2w and then utf.w2a.
-- Both functions are looked up at call time. In this file utf.u2w is defined
-- only in the Windows branch.
-- @param input UTF-8 encoded string
-- @return whatever utf.w2a returns for the wide string
function utf.u2a(input)
	local wide, wideLen = utf.u2w(input)
	return utf.w2a(wide, wideLen)
end

--- Convert a string by passing it through utf.a2w and then utf.w2u.
-- Both functions are looked up at call time. In this file utf.w2u is defined
-- only in the Windows branch.
-- @param input string in the encoding utf.a2w expects
-- @return whatever utf.w2u returns for the wide string
function utf.a2u(input)
	local wide, wideLen = utf.a2w(input)
	return utf.w2u(wide, wideLen)
end

return utf
