--  utf16.lua
local utf16 = {}

local ffi = require "mffi"
local bit = require "bit"
local band = bit.band
local bor = bit.bor
local rshift = bit.rshift
local lshift = bit.lshift
local b1, b2, b3, b4
-- http://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16

-- Split a 32-bit value into its four bytes, most significant byte first.
-- hex: number; the bit operations reduce it to 32 bits.
-- Returns c3, c2, c1, c0 where c3 holds bits 31..24 and c0 holds bits 7..0.
function utf16.hexToBytes(hex)
	local c3 = rshift(hex, 24)
	local c2 = band(rshift(hex, 16), 0xFF)
	local c1 = band(rshift(hex, 8), 0xFF)
	local c0 = band(hex, 0xFF)
	return c3, c2, c1, c0
end

-- Combine four bytes (most significant first) into one unsigned 32-bit value.
-- c3, c2: high and low byte of the upper 16 bits.
-- c1, c0: high and low byte of the lower 16 bits.
-- Returns a non-negative number in 0..0xFFFFFFFF for byte-sized inputs, so
-- hexToBytes(utf16leBytesToHex(c3, c2, c1, c0)) gives the same bytes back.
function utf16.utf16leBytesToHex(c3, c2, c1, c0)
	local high = lshift(c3, 8) + c2
	local low = lshift(c1, 8) + c0
	-- Multiply instead of lshift(high, 16): bit.lshift returns a signed 32-bit
	-- number, so every value with c3 >= 0x80 (all surrogate units, for example)
	-- used to come back negative.
	return high * 0x10000 + low
end

-- Compute the UTF-16 surrogate pair for a code point and return it as bytes.
-- http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
--   H = floor((S - 0x10000) / 0x400) + 0xD800
--   L = ((S - 0x10000) % 0x400) + 0xDC00
-- Returns four values: high byte and low byte of H, then high byte and low
-- byte of L. There is no range check; the formula only describes a surrogate
-- pair for code points U+10000..U+10FFFF.
function utf16.codepointToUtf16leBytes(codepoint)
	local offset = codepoint - 0x10000
	local lead = 0xD800 + math.floor(offset / 0x400) -- upper 10 bits
	local trail = 0xDC00 + band(offset, 0x3FF) -- lower 10 bits
	local leadHigh, leadLow = rshift(lead, 8), band(lead, 0xFF)
	local trailHigh, trailLow = rshift(trail, 8), band(trail, 0xFF)
	return leadHigh, leadLow, trailHigh, trailLow
end

-- Decode one code point from up to two UTF-16 code units given as bytes.
-- http://en.wikipedia.org/wiki/UTF-16
-- http://www.unicode.org/faq/utf_bom.html
--
-- The arguments are in reverse stream order:
--   c1, c0: high and low byte of the FIRST code unit
--   c3, c2: high and low byte of the code unit that FOLLOWS it
--
-- Returns codepoint, secondUnit:
--   * first unit in 0xD800..0xDBFF and second unit in 0xDC00..0xDFFF:
--     the combined code point U+10000..U+10FFFF
--     N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000
--   * anything else: the first unit unchanged (a BMP code point, or an
--     unpaired surrogate the caller can detect by its range)
-- secondUnit is always the 16-bit value built from c3, c2.
local function utf16leBytesToCodepoint(c3, c2, c1, c0)
	local firstUnit = lshift(c1, 8) + c0
	local secondUnit = lshift(c3, 8) + c2
	local isLead = firstUnit >= 0xD800 and firstUnit <= 0xDBFF
	local isTrail = secondUnit >= 0xDC00 and secondUnit <= 0xDFFF
	if isLead and isTrail then
		-- multiply by 0x400 = left shift by 10 bits
		return 0x10000 + lshift(firstUnit - 0xD800, 10) + (secondUnit - 0xDC00), secondUnit
	end
	-- Not a surrogate pair. The pair formula used to be applied here as well
	-- whenever the second unit was non-zero, producing a meaningless negative
	-- value for ordinary BMP characters followed by another character.
	return firstUnit, secondUnit
end
-- Expose the local decoder on the module table as well.
utf16.utf16leBytesToCodepoint = utf16leBytesToCodepoint

-- Append one 16-bit code unit to a byte buffer in little-endian order.
-- bytes: the code unit value
-- utf16: indexable byte buffer that receives the two bytes
-- j:     index of the first byte to write
-- Returns the index just past the two written bytes.
local function bytesToUtf16Stream(bytes, utf16, j)
	-- Low byte first. The mask used to be 0xFFFF, which stored the whole
	-- 16-bit value and was only correct when the buffer element type
	-- truncated it to 8 bits on assignment.
	utf16[j] = band(bytes, 0xFF)
	utf16[j + 1] = band(rshift(bytes, 8), 0xFF)
	return j + 2
end

-- Write U+FFFD 'REPLACEMENT CHARACTER' as UTF-8 (EF BF BD) at utf8[j..j+2].
-- http://www.fileformat.info/info/unicode/char/0fffd/index.htm
-- i is passed through untouched so callers can return the result directly.
-- Returns i and the output index just past the three written bytes.
local function utf8ReplacementChar(utf8, i, j)
	utf8[j] = 0xEF
	utf8[j + 1] = 0xBF
	utf8[j + 2] = 0xBD
	return i, j + 3
end

-- Encode one character as UTF-8 into utf8 starting at index j.
--
-- codepoint1 selects what is encoded:
--   codepoint1 == -1  `codepoint` is a supplementary code point decoded from
--                     a surrogate pair; 4 UTF-8 bytes are written and the
--                     input index advances by 4 (two UTF-16 code units).
--   otherwise         codepoint1 is a single UTF-16 code unit; `codepoint`
--                     is ignored and the input index advances by 2.
--                     A unit in 0xD800..0xDFFF (unpaired surrogate) is
--                     written as U+FFFD.
--
-- i: input index of the character being converted
-- Returns the next input index and the next output index.
--
-- Examples (http://www.fileformat.info/info/unicode/char/):
--   U+00A2  -> C2 A2
--   U+20AC  -> E2 82 AC
--   U+1D11E -> F0 9D 84 9E   (UTF-16: D834 DD1E)
local function codepointToUtf8Stream(codepoint, codepoint1, utf8, i, j)
	if codepoint1 == -1 then
		-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		utf8[j] = bor(rshift(codepoint, 18), 0xF0)
		utf8[j + 1] = bor(band(rshift(codepoint, 12), 0x3F), 0x80)
		utf8[j + 2] = bor(band(rshift(codepoint, 6), 0x3F), 0x80)
		utf8[j + 3] = bor(band(codepoint, 0x3F), 0x80)
		return i + 4, j + 4
	elseif codepoint1 >= 0xD800 and codepoint1 <= 0xDFFF then
		-- surrogate without its partner
		return utf8ReplacementChar(utf8, i + 2, j)
	elseif codepoint1 <= 0x7F then
		-- 0xxxxxxx; written as one byte so no overlong form is produced
		utf8[j] = codepoint1
		return i + 2, j + 1
	elseif codepoint1 <= 0x07FF then
		-- 110xxxxx 10xxxxxx
		-- Encodes codepoint1, the value the branch condition tests; this
		-- branch previously encoded `codepoint` instead.
		utf8[j] = bor(rshift(codepoint1, 6), 0xC0)
		utf8[j + 1] = bor(band(codepoint1, 0x3F), 0x80)
		return i + 2, j + 2
	end
	-- 1110xxxx 10xxxxxx 10xxxxxx
	utf8[j] = bor(rshift(codepoint1, 12), 0xE0)
	utf8[j + 1] = bor(band(rshift(codepoint1, 6), 0x3F), 0x80)
	utf8[j + 2] = bor(band(codepoint1, 0x3F), 0x80)
	return i + 2, j + 3
end

-- File-level scratch variables. The function below no longer uses them; the
-- declaration is kept so any code further down the file that still assigns
-- one of these names keeps writing a local rather than a global.
local lsB, msB, codepoint, codepoint1, codepoint2

-- Convert the UTF-16LE character at stream[i] to UTF-8 at utf8[j].
-- http://en.wikipedia.org/wiki/UTF-16
-- http://en.wikipedia.org/wiki/UTF-8
-- http://www.unicode.org/faq/utf_bom.html
--
-- UTF-16: U+0000..U+FFFF take 2 bytes, U+10000..U+10FFFF take 4 bytes.
-- UTF-8:  U+0000..U+007F take 1 byte, U+0080..U+07FF 2 bytes,
--         U+0800..U+FFFF 3 bytes, U+10000..U+10FFFF 4 bytes.
--
-- stream:    byte-indexed UTF-16LE input
-- utf8:      byte-indexed output buffer
-- i, j:      current input and output index
-- streamLen: optional input length in bytes. When given, the second half of
--            a surrogate pair is only read if it lies inside the stream.
--
-- Returns the next input index and the next output index. A U+0000 that is
-- not the very first output character ends the conversion: the function then
-- returns -1 and the output index of that NUL, so the NUL is not counted.
-- A lead surrogate without a following trail surrogate, or a trail surrogate
-- on its own, is written as U+FFFD.
local function utf16leToUtf8Stream(stream, utf8, i, j, streamLen)
	local low = stream[i]
	local high = stream[i + 1]
	if high == 0 and low <= 0x7F then
		-- 0xxxxxxx
		utf8[j] = low
		j = j + 1
		if low == 0 and j > 1 then
			return -1, j - 1
		end
		return i + 2, j
	end

	local unit = lshift(high, 8) + low
	if unit <= 0x07FF then
		-- 110xxxxx 10xxxxxx
		utf8[j] = bor(rshift(unit, 6), 0xC0)
		utf8[j + 1] = bor(band(unit, 0x3F), 0x80)
		return i + 2, j + 2
	end

	-- Look at the following code unit only when this one is a lead surrogate.
	-- It used to be read for every character >= U+0800, which ran two bytes
	-- past the end of the stream on the final character.
	if unit >= 0xD800 and unit <= 0xDBFF and (streamLen == nil or i + 3 < streamLen) then
		local trail = lshift(stream[i + 3], 8) + stream[i + 2]
		if trail >= 0xDC00 and trail <= 0xDFFF then
			local full = 0x10000 + lshift(unit - 0xD800, 10) + (trail - 0xDC00)
			return codepointToUtf8Stream(full, -1, utf8, i, j)
		end
	end

	-- BMP character (3 bytes) or unpaired surrogate (replacement character)
	return codepointToUtf8Stream(unit, unit, utf8, i, j)
end

-- Convert a UTF-16LE byte buffer to a UTF-8 Lua string.
--
-- utf16:         byte-indexed UTF-16LE input
-- utf16ByteLen:  input length in bytes; a negative value means the input is
--                terminated by a 16-bit NUL and its length is measured here
-- utf8buffer:    optional scratch buffer for the output bytes
-- utf8bufferLen: size of utf8buffer in bytes; the buffer is only used when it
--                holds at least 2 * utf16ByteLen bytes, otherwise a temporary
--                buffer is allocated
--
-- Conversion also stops at an embedded U+0000 (see utf16leToUtf8Stream).
-- Returns the converted text as a Lua string.
function utf16.utf16leToUtf8(utf16, utf16ByteLen, utf8buffer, utf8bufferLen)
	if utf16ByteLen < 0 then
		-- Measure up to the terminating NUL code unit first. The length is
		-- needed to size the output; previously the negative value itself was
		-- used as the size, and an empty string was read past its terminator.
		local measured = 0
		while utf16[measured] ~= 0 or utf16[measured + 1] ~= 0 do
			measured = measured + 2
		end
		utf16ByteLen = measured
	end

	-- One 2-byte code unit becomes at most 3 UTF-8 bytes, so twice the input
	-- size is always enough.
	local maxLen = utf16ByteLen * 2
	local utf8
	if utf8buffer and utf8bufferLen and utf8bufferLen >= maxLen then
		utf8 = utf8buffer
	else
		utf8 = ffi.newNoAnchor("uint8_t[?]", maxLen)
	end

	local i, j = 0, 0
	while i < utf16ByteLen do
		i, j = utf16leToUtf8Stream(utf16, utf8, i, j, utf16ByteLen)
		if i < 0 then
			-- the stream function reported an embedded NUL
			break
		end
	end
	return ffi.string(utf8, j)
end

-- Decode the UTF-8 sequence that starts at stream[i].
-- http://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16
-- http://en.wikipedia.org/wiki/UTF-8
--
--   1 byte : 0xxxxxxx                             7 bits   U+0000..U+007F
--   2 bytes: 110xxxxx 10xxxxxx                    5+6      U+0080..U+07FF
--   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx           4+6+6    U+0800..U+FFFF
--   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  3+6+6+6  U+10000..U+10FFFF
--
--   $  U+0024   24
--   ¢  U+00A2   C2 A2
--   €  U+20AC   E2 82 AC
--   䭢  U+24B62  F0 A4 AD A2
--
-- The sequence length is chosen from the first byte alone; the top bits of
-- the following bytes are masked off without being checked, and the input
-- length is not checked either. The bytes read are stored in the file-level
-- variables b1..b4.
--
-- Returns the index after the sequence and the code point, or just -1 when
-- the first byte is above 0xF7.
function utf16.utf8ToCodepoint(stream, i)
	b1 = stream[i]
	if b1 <= 0x7F then
		return i + 1, b1
	end

	b2 = stream[i + 1]
	if b1 <= 0xDF then
		-- 0x1F keeps the 5 payload bits of the lead byte, 0x3F the 6 payload
		-- bits of a continuation byte
		local leadBits = band(b1, 0x1F)
		local tailBits = band(b2, 0x3F)
		return i + 2, lshift(leadBits, 6) + tailBits
	end

	b3 = stream[i + 2]
	if b1 <= 0xEF then
		local value = lshift(band(b1, 0x0F), 12)
		value = value + lshift(band(b2, 0x3F), 6)
		value = value + band(b3, 0x3F)
		return i + 3, value
	end

	b4 = stream[i + 3]
	if b1 <= 0xF7 then
		local value = lshift(band(b1, 0x07), 18)
		value = value + lshift(band(b2, 0x3F), 12)
		value = value + lshift(band(b3, 0x3F), 6)
		value = value + band(b4, 0x3F)
		return i + 4, value
	end

	return -1
end

-- Append one code point to a UTF-16LE byte buffer.
--
-- codepoint: value to encode
-- utf16:     byte-indexed output buffer
-- j:         index of the first byte to write
--
-- Returns the next output index, plus an error code when the code point
-- could not be encoded as given:
--   -3  negative code point; nothing is written
--   -2  code point inside the surrogate range 0xD800..0xDFFF; U+FFFD written
--   -1  code point above U+10FFFF; U+FFFD written
-- On success only the index is returned.
local function codepointToUtf16le(codepoint, utf16, j)
	if codepoint < 0x00 then
		return j, -3
	elseif codepoint >= 0xD800 and codepoint <= 0xDFFF then
		j = bytesToUtf16Stream(0xFFFD, utf16, j) -- 0xFFFD = replacement char
		return j, -2
	elseif codepoint < 0x10000 then
		-- U+0000..U+D7FF and U+E000..U+FFFF are stored as a single code unit
		j = bytesToUtf16Stream(codepoint, utf16, j)
		return j
	elseif codepoint <= 0x10FFFF then
		-- Subtracting 0x10000 leaves a 20-bit number in 0..0xFFFFF.
		local offset = codepoint - 0x10000
		-- top 10 bits + 0xD800 = lead surrogate (0xD800..0xDBFF)
		j = bytesToUtf16Stream(rshift(offset, 10) + 0xD800, utf16, j)
		-- low 10 bits + 0xDC00 = trail surrogate (0xDC00..0xDFFF)
		j = bytesToUtf16Stream(band(offset, 0x3FF) + 0xDC00, utf16, j)
		return j
	end
	-- Beyond the Unicode range. This fallback used to sit after an
	-- unconditional else branch and could never run, so such values were
	-- written as code units outside the surrogate ranges.
	j = bytesToUtf16Stream(0xFFFD, utf16, j) -- 0xFFFD = replacement char
	return j, -1
end
utf16.codepointToUtf16le = codepointToUtf16le

-- Convert a UTF-8 byte buffer to UTF-16LE.
--
-- utf8:           byte-indexed UTF-8 input
-- utf8ByteLen:    input length in bytes
-- utf16buffer:    optional output buffer
-- utf16bufferLen: size of utf16buffer in bytes. The buffer is only used when
--                 it holds at least 2 * utf8ByteLen bytes (every ASCII byte
--                 becomes two bytes of UTF-16); otherwise a buffer of that
--                 size is allocated here.
--
-- Conversion stops early, after writing U+FFFD where applicable, when
--   * a lead byte above 0xF7 is found,
--   * a multi-byte sequence is cut off by the end of the input, or
--   * codepointToUtf16le reports an error for the decoded value.
--
-- Returns the number of bytes written and the buffer they were written to.
-- The second value is the only way to reach the output when the function
-- had to allocate its own buffer.
local function utf8ToUtf16le(utf8, utf8ByteLen, utf16buffer, utf16bufferLen)
	-- The required size used to be taken as utf8ByteLen, which overflowed the
	-- buffer for mostly-ASCII text.
	local maxLen = utf8ByteLen * 2
	local out
	if utf16buffer and utf16bufferLen and utf16bufferLen >= maxLen then
		out = utf16buffer
	else
		out = ffi.newNoAnchor("uint8_t[?]", maxLen)
	end

	local i, j = 0, 0
	while i < utf8ByteLen do
		local lead = utf8[i]
		local seqLen
		if lead <= 0x7F then -- 0xxxxxxx
			seqLen = 1
		elseif lead <= 0xDF then -- 110xxxxx
			seqLen = 2
		elseif lead <= 0xEF then -- 1110xxxx
			seqLen = 3
		elseif lead <= 0xF7 then -- 11110xxx
			seqLen = 4
		end

		if not seqLen or i + seqLen > utf8ByteLen then
			-- invalid lead byte, or the sequence would read past the input
			j = bytesToUtf16Stream(0xFFFD, out, j) -- 0xFFFD = replacement char
			break
		end

		local value
		if seqLen == 1 then
			value = lead
		elseif seqLen == 2 then
			value = lshift(band(lead, 0x1F), 6) + band(utf8[i + 1], 0x3F)
		elseif seqLen == 3 then
			value = lshift(band(lead, 0x0F), 12)
				+ lshift(band(utf8[i + 1], 0x3F), 6)
				+ band(utf8[i + 2], 0x3F)
		else
			value = lshift(band(lead, 0x07), 18)
				+ lshift(band(utf8[i + 1], 0x3F), 12)
				+ lshift(band(utf8[i + 2], 0x3F), 6)
				+ band(utf8[i + 3], 0x3F)
		end
		i = i + seqLen

		local err
		j, err = codepointToUtf16le(value, out, j)
		if err then
			break
		end
	end
	return j, out
end
utf16.utf8ToUtf16le = utf8ToUtf16le

-- Convert a UTF-8 Lua string to a Lua string holding the UTF-16LE bytes.
-- The output buffer is sized at two bytes per input byte plus one.
-- The code writes no byte order mark.
function utf16.toUtf16(str)
	local byteLen = #str
	local capacity = byteLen * 2 + 1
	local out = ffi.newAnchor("uint8_t[?]", capacity)
	local written = utf8ToUtf16le(ffi.cast("uint8_t*", str), byteLen, out, capacity)
	return ffi.string(out, written)
end

return utf16
