--- lib/scanner/hocr.lua
-- https://docs.google.com/document/d/1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0/preview
-- see: https://github.com/ryanfb/HocrConverter/blob/master/HocrConverter.py
-- see: https://github.com/tmbdev/hocr-tools/blob/master/hocr-pdf
-- https://github.com/eloops/hocr2pdf
package.path = "lib/?.lua;lib/?.lx;" .. package.path
package.path = "../lib/?.lua;../lib/?.lx;" .. package.path
package.path = "../../lib/?.lua;../../lib/?.lx;" .. package.path
require "start"

local hocr = {}

local xmlua = require "xmlua"
local util = require "util"
local fs = require "fs"
local peg = require "peg"
local draw = require "draw/draw"
local print = util.print
-- local re =  require "re"

-- Shared pattern pieces: both are used to build the bbox pattern, and ws is also
-- used by getText to normalise whitespace.
local cd = peg.capture(peg.least(peg.define.digit, 1)) -- capture at least one digit
local ws = peg.least(peg.define.anyWhiteSpace, 1) -- at least one whitespace character, matched but not captured

--- Extract the text of a node as a single line with normalised whitespace.
-- Every descendant text node is collected, each run of whitespace inside it is
-- collapsed into one space, the pieces are joined in document order and the
-- spaces at both ends of the joined line are stripped.
-- Whitespace inside the first and the last text node is kept as a single
-- space: deleting it (as was done before) glued words together whenever the
-- line consisted of one text node, e.g. "Hello world" became "Helloworld".
-- @param node element to read; must support node:search(xpath)
-- @return string the line text, "" when the node holds no text
local function getText(node)
	local textnodes = node:search(".//text()")
	local parts = {}
	for i, textNode in ipairs(textnodes) do
		parts[i] = peg.replace(textNode:content(), ws, " ")
	end
	-- after the replace above every whitespace run is a plain space, so
	-- stripping spaces is enough to clean the beginning and the end of the line;
	-- the outer parentheses keep only the captured text
	return (table.concat(parts):match("^ *(.-) *$"))
end

-- Pattern for the hOCR bounding box property: the word "bbox" followed by four
-- whitespace separated digit runs, each of which is captured.
local bboxre = peg.toPattern(peg.pattern "bbox" * ws * cd * ws * cd * ws * cd * ws * cd)

--- Extract the bbox property from a node.
-- The title= attribute of a node with an ocr_ class is expected to conform to
-- the hOCR spec, e.g. title="bbox 36 92 580 122".
-- @param node element to read; must support node:get_attribute(name)
-- @return the four values captured by bboxre, in the order they appear in the
--         title; four nils when the node has no title attribute or the
--         pattern does not match
local function getBbox(node)
	local data = node:get_attribute('title')
	if data == nil then
		-- no title attribute at all: report "no bbox" instead of handing nil to the matcher
		return nil, nil, nil, nil
	end
	local p1, p2, p3, p4 = peg.match(bboxre, data)
	return p1, p2, p3, p4
end

--- Parse an hOCR document and collect its text lines with their bounding boxes.
-- When html is nil, the file "test/lib/scanner/hOCR.xhtml" below util.mainPath()
-- is read and parsed instead.
-- @param html hOCR source text, or nil to use the sample file
-- @param option pass "print" to additionally get a tab separated listing
-- @return array of tables {left, top, right, bottom, text}, one entry for
--         every ocr_line element whose text is not blank
-- @return the listing as one string when option == "print", otherwise nil
function hocr.toHtml(html, option)
	if html == nil then
		html = fs.readFile(util.mainPath() .. "test/lib/scanner/hOCR.xhtml")
	end
	-- all elements whose class attribute is exactly ocr_line; whatever other
	-- markup a line contains does not matter, only its bbox and its text
	local lineNodes = xmlua.XML.parse(html):search("//*[@class='ocr_line']")
	local wantListing = option == "print"
	local rows = {}
	local listing
	if wantListing then
		listing = {"row\tleft\ttop\tright\tbottom\ttext\n"}
	end
	local count = 0
	for _, lineNode in ipairs(lineNodes) do
		local left, top, right, bottom = getBbox(lineNode)
		local content = getText(lineNode)
		local isBlank = peg.replace(content, " ", "") == ""
		if not isBlank then
			count = count + 1
			rows[count] = {left = left, top = top, right = right, bottom = bottom, text = content}
			if wantListing then
				-- slot 1 holds the header, so row n goes to slot n + 1
				listing[count + 1] = string.format("%d.\t%s\t%s\t%s\t%s\t%s\n", count, left, top, right, bottom, content)
			end
		end
	end
	if listing then
		return rows, table.concat(listing)
	end
	return rows, nil
end

--- Render the scanned picture onto a borderless A4 page and write it as a pdf.
-- The output file name is pictFileName with the part after its last "."
-- replaced by "pdf".
-- @param ohcrTbl not used by this function
-- @param pictFileName file name of the picture, used as the pict source
-- @param pictSize table with numeric width and height fields
function hocr.toPdf(ohcrTbl, pictFileName, pictSize)
	local outputType = "pdf" -- svg, pdf, png
	local outputName = peg.parseBeforeLast(pictFileName, ".") .. "." .. outputType
	local paperType = "A4" -- "A4 landscape"
	-- NOTE(review): this branch assigns the same value as the default, so wide
	-- pictures also get "A4"; possibly "A4 landscape" was intended - confirm
	-- against draw.setPaper before changing
	if pictSize.width > pictSize.height then
		paperType = "A4"
	end
	local dotsPerInch = 72 -- DPI, always on pdf
	-- no borders: 20 would be normal, but that is mainly for the web
	local paper = draw.setPaper({
		border_left = 0,
		border_top = 0,
		border_right = 0,
		border_bottom = 0
	}, paperType, outputType, dotsPerInch)
	-- the picture is placed at the origin and stretched to the paper size
	local scanPict = {
		id = "scan",
		type = "pict",
		x = 0,
		y = 0,
		width = paper.width,
		height = paper.height,
		source = pictFileName
	}
	local pref = {
		use_hpdf = true,
		file_name = outputName,
		document_type = outputType,
		paper = paper,
		content = {scanPict}
	}
	local started = util.seconds()
	local pictArr = draw.drawArray(pref)
	local elapsed = util.seconds(started)
	print(outputName .. " generation time: " .. util.seconds_to_clock(elapsed, 8))
	for index, pict in ipairs(pictArr) do
		-- string entries are skipped; everything else is written to the output file
		-- NOTE(review): every entry is written to the same file name, so with
		-- several entries only the last one would remain - verify drawArray's result
		if pict and type(pict) ~= "string" then
			print(index .. ". picture size: " .. #pict)
			util.writeFile(outputName, pict)
		end
	end
end

-- Stand-alone run: `...` is empty when this file is executed directly without
-- arguments, while require() passes the module name, so this only runs as a script.
-- hocr.toHtml() is called without arguments (html == nil, so it reads its
-- default test file) and its result is discarded.
if not ... then
	hocr.toHtml()
end

return hocr
