--- lib/scanner/tesseract.lua
-- https://github.com/tesseract-ocr/tesseract/wiki/APIExample
-- https://github.com/tesseract-ocr/tesseract/blob/main/include/tesseract/capi.h
-- https://github.com/kba/hocrjs
package.path = "lib/?.lua;lib/?.lx;" .. package.path
package.path = "../lib/?.lua;../lib/?.lx;" .. package.path
package.path = "../../lib/?.lua;../../lib/?.lx;" .. package.path
require "start"

local tesseract = {}

local ffi = require "mffi"
local C = ffi.C
local util = require "util"
local fs = require "fs"
local peg = require "peg"
local json = require "json"
local tessh = require "scanner/tesseract_h" -- todo: nx: must be before require vips
local vips = require "vips"
local draw = require "draw/draw"
local print = util.print
local hocr -- lazy load, require "scanner/hocr"
local tess, lept = tessh.tess, tessh.lept
-- Module-level state shared by the functions below:
--   datapath     - tessdata directory chosen in tesseractInit, also passed to the PDF renderer in tesseract.run
--   monitor      - progress monitor created in tesseractInit (non-Windows only), deleted in tesseractClear
local datapath, monitor
-- last percentage printed by progressFunction; reset to -1 in tesseractClear
local prevProgress = -1
-- when true, tesseract.run also writes hOCR.html / hOCR.json after recognition
local useHocr = false
-- set version in scanner/tesseract_h.lua / tessh.version = 4 or tessh.version = 3
-- Arguments this chunk was loaded with; note that this local shadows the global `arg` table.
local arg = {...}
if arg[1] then
	util.printTable(arg, "arg")
end
-- Opening part of the hOCR HTML page written by tesseract.run.
-- The {{fileNameShort}} placeholder is replaced there with the short name of the page image.
local htmlStart = [[
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta content="ocr_line ocr_page" name="ocr-capabilities"/>
    <meta content="en" name="ocr-langs"/>
    <meta content="Latn" name="ocr-scripts"/>
    <meta content="" name="ocr-microformats"/>
    <title>OCR Output</title>
  </head>
	<style>
		body {
			background-image: url("{{fileNameShort}}");
		}
	</style>
  <body>
]]
-- Closing part of the hOCR HTML page; appended after the hOCR text in tesseract.run.
local htmlEnd = [[
  </body>
	<script src="hocr.fullscreen.js"></script>
</html>
]] -- <script src="https://unpkg.com/hocrjs"></script>

--- Progress callback invoked by Tesseract during recognition.
-- Prints the progress each time it reaches a new, higher multiple of 10 percent.
-- The callback arguments (ths, left, right, top, bottom) are not used; the
-- percentage is read from the module-level `monitor` instead.
local function progressFunction(ths, left, right, top, bottom)
	-- Never yield() from here: it fails with "attempt to yield across C-call boundary".
	local progress = tess.TessMonitorGetProgress(monitor)
	if not (progress > prevProgress) then
		return -- nothing new since the last report
	end
	if util.fmod(progress, 10) ~= 0 then
		return -- only report on multiples of ten
	end
	prevProgress = progress
	util.printInfo("percent done %d%%", progress)
	-- util.printInfo("percent done %d%%, left %d, top %d, right %d, bottom %d", tonumber(progress), tonumber(left), tonumber(top), tonumber(right), tonumber(bottom))
end
-- C function pointer wrapping progressFunction, handed to TessMonitorSetProgressFunc in tesseractInit.
-- NOTE(review): the cast declares the return type as TessProgressFunc itself; capi.h declares
-- TessProgressFunc as a bool-returning function pointer - confirm against scanner/tesseract_h.
local progressFunctionPointer = ffi.cast("TessProgressFunc (*)(ETEXT_DESC* ths, int left, int right, int top, int bottom)", progressFunction)

--- Resolves the picture file to scan.
-- Lookup order:
--   1. `fileName` as given, when it exists;
--   2. `fileName` inside <mainPath>/test/lib/scanner/;
--   3. when no `fileName` was given: scan.jpg, scan.jpeg, scan.png inside that folder.
-- @param fileName optional path or bare file name
-- @return the existing path, or nothing (after printing a warning) when no candidate exists
local function pictFileName(fileName)
	-- only probe the raw name when there is one; nil must not reach fs.fileExists
	if fileName and fs.fileExists(fileName) then
		return fileName
	end
	local scannerDir = util.mainPath() .. "test/lib/scanner/"
	-- an explicit name yields exactly one candidate, so it is probed once instead of three times
	local candidates = fileName and {fileName} or {"scan.jpg", "scan.jpeg", "scan.png"} -- scan_por
	for _, name in ipairs(candidates) do
		local path = scannerDir .. name
		-- truthiness test: a nil answer from fs.fileExists counts as "missing" too
		if fs.fileExists(path) then
			return path
		end
	end
	--[[ todo: fall back to test/lib/scanner/scan.pdf and convert it to png or jpeg ]]
	util.printWarning("file '%s' does not exist", scannerDir .. candidates[1])
	return
end

--- Releases the progress monitor (if any) and tears down a Tesseract API handle.
-- Also resets the module-level progress state so the next run reports from scratch.
-- @param api handle created by tess.TessBaseAPICreate
local function tesseractClear(api)
	local activeMonitor = monitor
	-- plain truthiness test on purpose (not a comparison with nil)
	if activeMonitor then
		tess.TessMonitorDelete(activeMonitor)
	end
	monitor, prevProgress = nil, -1
	-- teardown order: clear the results, end the engine, free the handle
	tess.TessBaseAPIClear(api)
	tess.TessBaseAPIEnd(api)
	tess.TessBaseAPIDelete(api)
end

--- Creates and initialises a Tesseract API handle.
-- Any handle passed in is torn down first. Sets the module-level `datapath`
-- and, on non-Windows systems, creates the progress `monitor`.
-- @param api optional existing handle to release before creating the new one
-- @param dataVersion 4 selects bin_data/tessdata_fast with the LSTM engine; any other
--   value selects bin_data/tessdata_old with the legacy engine
-- @param lang language code, defaults to "eng"
-- @return the new API handle, or nil when initialisation failed
local function tesseractInit(api, dataVersion, lang)
	if api then
		tesseractClear(api)
	end
	if monitor then
		-- a monitor left behind by a run that aborted without tesseractClear would be overwritten and leaked below
		tess.TessMonitorDelete(monitor)
		monitor = nil
	end
	lang = lang or "eng" --- todo: get default lang from prefs
	api = tess.TessBaseAPICreate()
	-- dataVersion = 3
	-- tessdata_fast can't handle orientation, we use old tessdata to handle the orientation detection
	local useFastData = dataVersion == 4
	local dataDir = useFastData and "/bin_data/tessdata_fast" or "/bin_data/tessdata_old"
	datapath = peg.parseBeforeLast(util.pathBin():sub(1, -2), "/") .. dataDir
	-- pick the engine mode; each constant is only looked up in the branch that needs it
	local engineMode
	if tessh.version == 3 then
		-- ret = tess.TessBaseAPIInit3(api, datapath, lang)
		engineMode = C.OEM_DEFAULT
	elseif useFastData then
		engineMode = C.OEM_LSTM_ONLY
	else
		engineMode = C.OEM_TESSERACT_ONLY
	end
	local ret = tess.TessBaseAPIInit4(api, datapath, lang, engineMode, nil, 0, nil, nil, 0, C.TRUE)
	if ret ~= 0 then
		-- tostring: a nil dataVersion must not turn the warning itself into a Lua error
		util.printWarning("Error initialising Tesseract, data version: " .. tostring(dataVersion))
		tesseractClear(api)
		return nil
	end
	if util.isWin() == false then
		prevProgress = -1 -- start progress reporting from scratch for the new monitor
		monitor = tess.TessMonitorCreate()
		tess.TessMonitorSetProgressFunc(monitor, progressFunctionPointer)
	end
	return api
end

--- Runs OCR on one image file and renders a searchable PDF.
-- Steps: read the image, detect the page orientation, write a rotated copy
-- ("<name>_rotated.<ext>") when the page is not upright, optionally write a
-- downscaled A4-sized copy ("<name>_rotated_A4.<ext>"), run recognition with
-- progress reporting, optionally export hOCR (see `useHocr`), and finally let
-- Tesseract render <mainPath>/test/lib/scanner/scanout.pdf.
-- @param fileName path of the image to recognise
-- @param lang Tesseract language code, defaults to "eng"
-- @return recognised text and the path of the rendered PDF; nothing when any step failed
function tesseract.run(fileName, lang)
	lang = lang or "eng" -- same default tesseractInit applies; avoids logging a nil language below
	local api = nil
	local time = util.seconds()
	local versionTesseract = ffi.string(tess.TessVersion())
	local versionLeptonica = ffi.string(lept.getLeptonicaVersion())
	util.printOk("Tesseract version: %s, Leptonica version: %s, using language: %s", versionTesseract, versionLeptonica, lang)

	local outputbase = util.mainPath() .. "test/lib/scanner/scanout"
	if util.isWine() then
		outputbase = "Z:" .. outputbase
		-- elseif util.isWin() then
		-- datapath = "C:/Program Files (x86)/Tesseract-OCR/tessdata"
	end
	-- one-element PIX* array so that lept.pixDestroy can be given the address of the pointer
	local img = ffi.newNoAnchor("PIX*[1]")
	img[0] = lept.pixRead(fileName)
	if ffi.isNull(img[0]) then
		util.printWarning("Error reading image '%s'", fileName)
		return
	end

	--[=[
	api = tesseractInit(nil, 3, lang) -- detect orientation with old tesseract 3 data
	if api == nil then
		return
	end
	tess.TessBaseAPISetImage2(api, img[0])
	local orientationTime = util.seconds()
	util.printInfo("* Tesseract recognizing image orientation '%s'", fileName)
	local orientation_degree = ffi.new("int[1]")
	local orientation_confidence = ffi.new("float[1]")
	local best_script_name = ffi.new("const char*[1]")
	local script_confidence = ffi.new("float[1]")
	--[[
		 * Detect the orientation of the input image and apparent script (alphabet).
		 * orient_deg is the detected clockwise rotation of the input image in degrees
		 * (0, 90, 180, 270)
		 * orient_conf is the confidence (15.0 is reasonably confident)
		 * script_name is an ASCII string, the name of the script, e.g. "Latin"
		 * script_conf is confidence level in the script
		 * Returns true on success and writes values to each parameter as an output
	]]
	local ret =
		tess.TessBaseAPIDetectOrientationScript(
		api,
		orientation_degree,
		orientation_confidence,
		best_script_name,
		script_confidence
	)
	if ret ~= C.TRUE then
		util.printWarning("Error in TessBaseAPIDetectOrientationScript")
		return
	end
	orientationTime = util.seconds(orientationTime)
	print("best orientation degree: "..orientation_degree[0])
	print("script confidence: "..script_confidence[0])
	print("orientation confidence: "..orientation_confidence[0])
	if ffi.isNotNull(best_script_name[0]) then
		print("best script name: "..ffi.string(best_script_name[0]))
	end
	--]=]

	api = tesseractInit(api, 4, lang) -- recognition with new tesseract 4 data
	if api == nil then
		lept.pixDestroy(img) -- do not leak the image when the engine cannot be initialised
		return
	end
	tess.TessBaseAPISetImage2(api, img[0])
	tess.TessBaseAPISetSourceResolution(api, 600) -- resolution is hard-coded to 600

	util.printInfo("* Tesseract detect image orientation from file '%s'...", fileName)
	-- out-parameters for TessPageIteratorOrientation; ffi.new zero-fills them
	local orientation = ffi.new("TessOrientation[1]")
	local writing_direction = ffi.new("TessWritingDirection[1]")
	local textline_order = ffi.new("TessTextlineOrder[1]")
	local deskew_angle = ffi.new("float[1]")
	-- Find as much text as possible in no particular order with orientation and script detection.
	-- local defaultPageSegMode = tess.TessBaseAPIGetPageSegMode(api)
	tess.TessBaseAPISetPageSegMode(api, C.PSM_SPARSE_TEXT_OSD) -- C.PSM_SPARSE_TEXT_OSD == 12, see: http://tess4j.sourceforge.net/docs/docs-1.2/net/sourceforge/tess4j/TessAPI.TessPageSegMode.html
	local orientationTime = util.seconds()
	local pageIterator = tess.TessBaseAPIAnalyseLayout(api)
	if ffi.isNull(pageIterator) then
		-- layout analysis can yield no iterator (error or empty page); the zero-filled
		-- out-parameters are then used as they are instead of dereferencing NULL
		util.printWarning("Tesseract layout analysis returned no result for '%s', orientation is left unchanged", fileName)
	else
		tess.TessPageIteratorOrientation(pageIterator, orientation, writing_direction, textline_order, deskew_angle)
		-- NOTE(review): the iterator is never released here; capi.h offers TessPageIteratorDelete -
		-- confirm it is declared in scanner/tesseract_h before calling it.
	end
	orientationTime = util.seconds(orientationTime)
	print(" - orientation      : " .. tonumber(orientation[0]))
	print(" - writing direction: " .. tonumber(writing_direction[0]))
	print(" - textline order   : " .. tonumber(textline_order[0]))
	print(" - deskew angle     : " .. tonumber(deskew_angle[0]))
	util.printOk("* Tesseract detect orientation time %.1f", orientationTime)

	local fixingOrientationTime = 0
	local image = vips.Image.new_from_file(fileName)
	if orientation[0] ~= C.ORIENTATION_PAGE_UP then
		fixingOrientationTime = util.seconds()
		lept.pixDestroy(img) -- the unrotated image is replaced below
		if orientation[0] == C.ORIENTATION_PAGE_RIGHT then
			util.printInfo("* Fixing orientation from right...")
			image = image:rot270()
		elseif orientation[0] == C.ORIENTATION_PAGE_DOWN then
			util.printInfo("* Fixing orientation from down...")
			image = image:rot180()
		elseif orientation[0] == C.ORIENTATION_PAGE_LEFT then
			util.printInfo("* Fixing orientation from left...")
			image = image:rot90()
		end
		-- from here on fileName refers to the rotated copy
		local ext = peg.parseLast(fileName, ".")
		local name = peg.parseBeforeLast(fileName, ".")
		fileName = name .. "_rotated." .. ext
		image:write_to_file(fileName)
		-- img = ffi.newNoAnchor("PIX*[1]")
		img[0] = lept.pixRead(fileName)
		if ffi.isNull(img[0]) then
			util.printWarning("Error reading image '%s'", fileName)
			lept.pixDestroy(img)
			tesseractClear(api)
			return
		end
		tess.TessBaseAPISetImage2(api, img[0]) -- re-set the image
		tess.TessBaseAPISetSourceResolution(api, 600)
		fixingOrientationTime = util.seconds(fixingOrientationTime)
		util.printOk("* Tesseract fix orientation time: %.1f", fixingOrientationTime)
	end
	util.printInfo("* Creating rotated A4 picture...")
	local createRotatedSmallPictTime = util.seconds()
	local maxWidth, maxHeight = draw.paperWidthHeight("A4", 150) -- 150 dpi
	local width, height = image:width(), image:height()
	if width > height then
		maxWidth, maxHeight = maxHeight, maxWidth -- landscape image
	end
	-- the smaller of the two ratios makes the picture fit both dimensions
	local scaleW = maxWidth / width
	local scaleH = maxHeight / height
	local scale = scaleW < scaleH and scaleW or scaleH
	local smallFileName = peg.parseBeforeLast(fileName, ".")
	smallFileName = peg.replace(smallFileName, "_rotated", "") .. "_rotated_A4." .. peg.parseLast(fileName, ".")
	local smallPictureSize
	if scale < 1 then
		-- image = image:resize(scale)
		-- distinct names: the outer `width` and `image` are no longer shadowed
		local thumbWidth = math.floor(scale * width)
		local thumbnail = vips.Image.thumbnail(fileName, thumbWidth)
		smallPictureSize = {width = thumbnail:width(), height = thumbnail:height()}
		thumbnail:write_to_file(smallFileName)
	else
		-- picture already fits: remove any stale small copy and use the full-size file
		fs.deleteFile(smallFileName)
		smallPictureSize = {width = width, height = height}
		smallFileName = nil
	end
	createRotatedSmallPictTime = util.seconds(createRotatedSmallPictTime)
	util.printOk("* Creating rotated A4 picture time: %.1f", createRotatedSmallPictTime)

	image = nil -- try to release memory
	collectgarbage()
	collectgarbage()
	collectgarbage()

	-- [[
	if true then
		util.printInfo("* Tesseract recognizing image '%s'", fileName)
		tess.TessBaseAPISetPageSegMode(api, C.PSM_AUTO) -- C.PSM_AUTO_OSD == 1
		local recgTime = util.seconds()
		local ret = tess.TessBaseAPIRecognize(api, monitor)
		if ret ~= 0 then
			util.printWarning("Error in Tesseract recognition")
			-- release the image and the engine instead of leaking them on this error path
			lept.pixDestroy(img)
			tesseractClear(api)
			return
		end
		recgTime = util.seconds(recgTime)
		util.printOk("* Tesseract recognizing time %.1f", recgTime)
		-- local text = tess.TessBaseAPIGetUTF8Text(api)
		if useHocr then -- or peg.found(option, "hocr") then
			hocr = hocr or require "scanner/hocr"
			local hOCRText = tess.TessBaseAPIGetHOCRText(api, 1) -- 1 is page number
			if ffi.isNotNull(hOCRText) then
				local fileNameShort = peg.parseLast(smallFileName or fileName, "/", "returnWithoutDivider")
				hOCRText = peg.replace(htmlStart, "{{fileNameShort}}", fileNameShort) .. peg.replace(ffi.string(hOCRText), 'image ""', 'image "' .. fileNameShort .. '"') .. htmlEnd
				-- hOCRText = peg.replace(hOCRText , "lang=", "xml:lang=") -- xml
				local txtPath = util.mainPath() .. "test/lib/scanner/hOCR.html"
				fs.writeFile(txtPath, hOCRText)
				local ohctTbl, hOCRTextCoord = hocr.toHtml(hOCRText)
				txtPath = util.mainPath() .. "test/lib/scanner/hOCR.json"
				fs.writeFile(txtPath, json.toJson(ohctTbl))
				-- txtPath = util.mainPath().."test/lib/scanner/hOCR.txt"
				-- fs.writeFile(txtPath, hOCRTextCoord)
				-- fs.openFile(txtPath)
				if false then -- pdf will be too big, hOCR.html is enough
					if smallFileName then
						hocr.toPdf(ohctTbl, smallFileName, smallPictureSize)
					else
						hocr.toPdf(ohctTbl, fileName, smallPictureSize)
					end
				end
			end
		end
	end
	-- ]]

	--[[
		 * Turns images into symbolic text.
		 *
		 * filename can point to a single image, a multi-page TIFF,
		 * or a plain text list of image filenames.
		 *
		 * retry_config is useful for debugging. If not NULL, you can fall
		 * back to an alternate configuration if a page fails for some
		 * reason.
		 *
		 * timeout_millisec terminates processing if any single page
		 * takes too long. Set to 0 for unlimited time.
		 *
		 * renderer is responsible for creating the output. For example,
		 * use the TessTextRenderer if you want plaintext output, or
		 * the TessPDFRender to produce searchable PDF.
		 *
		 * If tessedit_page_number is non-negative, will only process that
		 * single page. Works for multi-page tiff file, or filelist.
		 *
		 * Returns true if successful, false on error.
	]]
	util.printInfo("* Tesseract is creating searchable pdf for image '%s'", fileName)
	-- local filename = img[0]
	local outputPdf = outputbase .. ".pdf"
	fs.deleteFile(outputPdf, nil, false, "no-warning")
	local recocnitionTime = util.seconds()
	tess.TessBaseAPISetPageSegMode(api, C.PSM_AUTO) -- C.PSM_AUTO_OSD == 1
	local textonly = C.FALSE -- TRUE
	-- NOTE(review): the renderer is never released here; capi.h offers TessDeleteResultRenderer -
	-- confirm it is declared in scanner/tesseract_h before calling it.
	local renderer = tess.TessPDFRendererCreate(outputbase, datapath, textonly)
	local retryConfig = nil
	local timeoutMillisec = 0 -- 60 * 1000
	local ret = tess.TessBaseAPIProcessPages(api, fileName, retryConfig, timeoutMillisec, renderer)
	local text = tess.TessBaseAPIGetUTF8Text(api)
	-- local text2 = tess.TessBaseAPIGetHOCRText(api, 1)
	recocnitionTime = util.seconds(recocnitionTime)
	-- copy the C string into a Lua string before the C buffer is freed below
	local textLua
	if ffi.isNotNull(text) then
		textLua = ffi.string(text)
	end
	--[[
	local textLuaHcr
	if ffi.isNotNull(text2) then
		textLuaHcr = ffi.string(text2)
		print(textLuaHcr)
	end
	]]
	-- release everything first, then report errors; `text` is only compared below, never dereferenced
	lept.pixDestroy(img)
	tess.TessDeleteText(text)
	tesseractClear(api)
	if ffi.isNull(text) then
		util.printWarning("Error getting Tesseract text")
		return
	end
	if ret ~= C.TRUE then
		util.printWarning("Error in TessBaseAPIProcessPages")
		return
	end

	util.printOk("* Tesseract version %s, total time %.1f, detect orientation time %.1f, fix orientation time %.1f, recognition time %.1f seconds\n", versionTesseract, util.seconds(time), orientationTime, fixingOrientationTime, recocnitionTime)
	-- util.printOk("* Tesseract version %s end, total time %.1f, recocnition time %.1f seconds\n", versionTesseract, util.seconds(time), recocnitionTime)
	return textLua, outputPdf -- , textLuaHcr
end

--- Opens every http(s) barcode of a decoder result list in the browser.
-- A barcode equal to the one seen just before it is not opened again.
-- @param results array of decoder results with an optional `barcode` field; may be nil
-- @param lastResult last result that carried a barcode so far, or nil
-- @return the last result that carried a barcode (unchanged `lastResult` when there was none)
local function openBarcodeUrls(results, lastResult)
	for _, ret in ipairs(results or {}) do
		if ret.barcode then
			if (lastResult == nil or lastResult.barcode ~= ret.barcode) and peg.startsWith(ret.barcode, "http") then
				util.openUrl(ret.barcode)
			end
			lastResult = ret
		end
	end
	return lastResult
end

--- Debug / command-line driver: optional barcode scan followed by OCR of one picture.
-- When `option` contains "barcode", the picture is scanned with quirc (macOS/Linux only),
-- ZXing (.png files only) and zbar, and barcodes starting with "http" are opened.
-- Afterwards tesseract.run is called; on success the text is written to
-- <mainPath>/test/lib/scanner/scanout.txt and tesseract_resize_pdf.lua is started
-- in the background for the rendered PDF.
-- @param option option string, checked for "barcode"
-- @param fileName picture to scan, resolved with pictFileName (may be nil)
-- @param lang Tesseract language code passed on to tesseract.run
function tesseract.debug(option, fileName, lang)
	fileName = pictFileName(fileName)
	if not fileName then
		return -- pictFileName has already printed the warning
	end
	if peg.found(option, "barcode") then
		local retJson, retJsonArr
		if util.isMac() or util.isLinux() then
			-- try quirc QR_CODE reader - recognizes only QR-Code!
			local quirc = require "barcode/quirc"
			local barcodeTime1 = util.seconds()
			retJsonArr = quirc.decodeFile(fileName)
			barcodeTime1 = util.seconds(barcodeTime1)
			-- a nil result counts as zero barcodes instead of raising on `#`
			util.printInfo("* quirc barcode recognition time: %.2f seconds, count: %d, result:\n%s", barcodeTime1, retJsonArr and #retJsonArr or 0, json.toJson(retJsonArr))
			retJson = openBarcodeUrls(retJsonArr, retJson)
		end
		if peg.endsWith(fileName, ".png") then
			-- try zxing QR_CODE
			local zxing = require "barcode/zxing"
			local barcodeTime2 = util.seconds()
			local fastMode, tryRotate, format = false, true, "" -- for example: "" = all, CODE_128, DATA_MATRIX, QR_CODE, EAN_13, EAN_8 - but 1D codes are not well recognized
			retJsonArr = zxing.decodePngFile(fastMode, tryRotate, format, fileName)
			barcodeTime2 = util.seconds(barcodeTime2)
			util.printInfo("* ZXing barcode recognition time: %.2f seconds, result:\n%s", barcodeTime2, json.toJson(retJsonArr))
			retJson = openBarcodeUrls(retJsonArr, retJson)
		end
		-- zbar is always tried as well (previously planned: only if retJson == nil or retJson.barcode == nil)
		util.printInfo("* Recognizing barcodes from image '%s'", fileName)
		local zbar = require "barcode/zbar"
		local barcodeTime = util.seconds()
		retJsonArr = zbar.scanImage(fileName, {"CODE_128", "DATA_MATRIX", "QR_CODE"})
		barcodeTime = util.seconds(barcodeTime)
		util.printInfo("* Zbar barcode recognition time %.2f seconds, result:\n%s", barcodeTime, json.toJson(retJsonArr))
		retJson = openBarcodeUrls(retJsonArr, retJson)
		--[[
		txtPath = util.mainPath().."test/lib/scanner/barcode.json"
		fs.writeFile(txtPath, txt)
		fs.openFile(txtPath)
		]]
	end
	local txt, outputPdf = tesseract.run(fileName, lang)
	-- print("tesseract result text:\n"..txt)
	if txt then
		local txtPath = util.mainPath() .. "test/lib/scanner/scanout.txt"
		fs.writeFile(txtPath, txt)
		-- fs.openFile(txtPath)
		-- util.print("open "..outputPdf)
		-- fs.openFile(outputPdf, 1)
		local cmd = "lj " .. util.mainPath() .. "lib/scanner/tesseract_resize_pdf.lua '" .. outputPdf .. "' &"
		util.printInfo("* run: " .. cmd)
		util.runCommandLine(cmd) -- does not work before program ends, library is still writing pdf, need to start background program
		-- fs.deleteFile(outputPdf, nil, false, "no-warning")
	end
end

-- Entry point: run the debug driver depending on how this chunk was loaded.
--   no arguments               -> default picture, language "fin"
--   <file> <3-letter language> -> that picture and language
--   <file>                     -> that picture, language "fin"
-- A first argument of "scanner/tesseract" with no 3-letter second argument runs nothing.
local target, langCode = arg[1], arg[2]
if not target then
	tesseract.debug("barcode", nil, "fin") -- "por"
elseif langCode and #langCode == 3 then -- 3 letter lang code
	tesseract.debug("barcode", target, langCode)
elseif target ~= "scanner/tesseract" then
	tesseract.debug("barcode", target, "fin")
end

return tesseract
