#!/usr/bin/env ruby

require 'hyph-utf8'

# Relative location of the encoding definition files.
# NOTE(review): not referenced anywhere else in this script; presumably read
# by the hyph-utf8 library -- confirm before removing.
$encoding_data_dir = 'data/encodings'

# Encodings handled by the first generator loop below.
# All known encodings, for reference: ec qx t2a lmc il2 il3 l7x t8m lth
$encodings = %w[t8m lth]

# The output tree is resolved three levels above the current working directory
# (File.expand_path without a second argument is relative to the cwd).
$path_root = File.expand_path('../../..')
$output_data_dir = "#{$path_root}/tex/generic/hyph-utf8/conversions"

# First pass: for every encoding in $encodings write conv-utf8-<encoding>.tex
# into $output_data_dir. The generated TeX turns a UTF-8 byte sequence into a
# Unicode number with \numexpr and looks that number up in a table filled by
# \addunichar.
#
# Raises RuntimeError if an encoding has no characters or contains a character
# whose UTF-8 length is not 2, 3 or 4 bytes (no decoding macro exists for it).
$encodings.each do |encoding|
	# load encoding
	e = Encoding.new(encoding)

	# [key, character] pairs; sorted once and reused for the table below
	unicode_characters = e.unicode_characters.sort
	raise "The encoding #{encoding} defines no characters" if unicode_characters.empty?

	# minimal and maximal length (in UTF-8 bytes) of characters in the encoding
	# (until now just 2 & 3); taken over all characters so that the result does
	# not depend on how the keys happen to sort
	length_min, length_max = unicode_characters.map { |_code, c| c.bytes.size }.minmax

	# block form: the file is closed even if generation raises half-way
	File.open("#{$output_data_dir}#{File::Separator}conv-utf8-#{encoding}.tex", "w") do |file_out|
		# copyright notice
		file_out.puts "% conv-utf8-#{encoding}.tex"
		file_out.puts "%"
		file_out.puts "% Conversion from UTF-8 to #{encoding.upcase},"
		file_out.puts "% used before loading hyphenation patterns for 8-bit TeX engines."
		file_out.puts "%"
		file_out.puts "% This file is part of hyph-utf8 package and autogenerated."
		file_out.puts "% See http://tug.org/tex-hyphen"
		file_out.puts "%"
		file_out.puts "% Copyright 2008-2013 TeX Users Group."
		file_out.puts "% You may freely use, modify and/or distribute this file."
		file_out.puts "% (But consider adapting the scripts if you need modifications.)"
		file_out.puts

		# macro to get mapping unicode -> font encoding & error message if screwed up
		file_out.puts '% macros adapted from ConTeXt MKII; see unic-ini.mkii'
		file_out.puts '\def\unicodechar#1{%'
		file_out.puts '	\ifcsname unichar@\number#1\endcsname'
		file_out.puts '	  \csname unichar@\number#1\endcsname'
		file_out.puts '	\else'
		file_out.puts '	  \errmessage{Unicode character [#1] not in encoding.}%'
		file_out.puts '	\fi}'

		# only output the necessary macros for transforming UTF-8 -> Unicode number
		if length_min <= 2 && length_max >= 2
			file_out.puts '\def\utftwouniglyph#1#2%'
			file_out.puts '	{\expandafter\unicodechar\expandafter'
			file_out.puts '		{\the\numexpr64*(#1-192)+`#2-128\relax}}'
		end
		if length_min <= 3 && length_max >= 3
			file_out.puts '\def\utfthreeuniglyph#1#2#3%'
			file_out.puts '	{\expandafter\unicodechar\expandafter'
			file_out.puts '		{\the\numexpr4096*(#1-224)+64*(`#2-128)+`#3-128\relax}}'
		end
		if length_min <= 4 && length_max >= 4
			file_out.puts '\def\utffouruniglyph#1#2#3#4%'
			file_out.puts '	{\expandafter\unicodechar\expandafter'
			file_out.puts '		{\the\numexpr262144*(#1-240)+4096*(`#2-128)+64*(`#3-128)+`#4-128\relax}}'
		end

		# macro to store mapping unicode -> font encoding
		file_out.puts
		file_out.puts '\def\addunichar #1 #2 {\expandafter\def\csname unichar@\number#1\endcsname{#2}}'
		file_out.puts
		file_out.puts '% \addunichar "unicode_code - ^^font_encoding_code'

		# all unicode characters in the encoding; each line also sets the
		# \lccode of the encoded slot to itself
		unicode_characters.each do |_code, c|
			file_out.puts sprintf("\\addunichar \"%04X ^^%02x \\lccode\"%02X=\"%02X %% %s - %s",
				c.code_uni, c.code_enc, c.code_enc, c.code_enc, [c.code_uni].pack('U'), c.name)
		end
		file_out.puts

		# make all the possible first characters active
		# output the definition into file
		e.unicode_characters_first_byte.sort.each do |first_byte_code, chars|
			byte = first_byte_code.hex
			# the first byte of a UTF-8 sequence determines its length:
			#   2-byte: 0b11000000 <= byte < 0b11100000
			#   3-byte: 0b11100000 <= byte < 0b11110000
			#   4-byte: 0b11110000 <= byte < 0b11111000
			str =
				case chars[0].bytes.size
				when 2 then "two"
				when 3 then "three"
				when 4 then "four"
				else
					# previously this fell through with str == nil and silently
					# emitted a call to the undefined macro \utfuniglyph
					raise "The encoding #{encoding} has characters of unsupported UTF-8 length (first byte 0x#{first_byte_code})"
				end
			file_out.puts sprintf("\\catcode\"%02X=\\active \\def^^%02x{\\utf%suniglyph{\"%02X}}", byte, byte, str, byte)
		end
	end
end

# Second pass: encodings whose characters all take exactly two UTF-8 bytes.
# For each of them conv-utf8-<encoding>.tex is written into $output_data_dir.
# The generated TeX makes every possible first byte active and defines it as a
# one-argument macro that compares the second byte against the known
# characters in a chain of \ifx tests.
#
# Raises RuntimeError if an encoding contains a character that does not take
# exactly two UTF-8 bytes.
$encodings = ["ec", "qx", "t2a", "lmc", "il2", "il3", "l7x"]

$encodings.each do |encoding|
	# load encoding
	e = Encoding.new(encoding)

	# block form: the file is closed even if generation raises half-way
	File.open("#{$output_data_dir}#{File::Separator}conv-utf8-#{encoding}.tex", "w") do |file_out|
		# copyright notice
		file_out.puts "% conv-utf8-#{encoding}.tex"
		file_out.puts "%"
		file_out.puts "% Conversion from UTF-8 to #{encoding.upcase},"
		file_out.puts "% used before loading hyphenation patterns for 8-bit TeX engines."
		file_out.puts "%"
		file_out.puts "% This file is part of hyph-utf8 package and autogenerated."
		file_out.puts "% See http://tug.org/tex-hyphen"
		file_out.puts "%"
		file_out.puts "% Copyright 2008-2013 TeX Users Group."
		file_out.puts "% You may freely use, modify and/or distribute this file."
		file_out.puts "% (But consider adapting the scripts if you need modifications.)"
		file_out.puts "%"

		# [first byte as hex string, characters starting with that byte] pairs
		first_bytes = e.unicode_characters_first_byte.sort

		# make all the possible first characters active
		first_bytes.each do |first_byte_code, _chars|
			file_out.puts sprintf("\\catcode\"%02X=\\active", first_byte_code.hex)
		end
		file_out.puts "%"

		# output the definition of every active first byte into file
		first_bytes.each do |first_byte_code, chars|
			# sort the second characters by Unicode code point; a local copy is
			# sorted so that the data owned by the encoding object is not mutated
			sorted_chars = chars.sort_by(&:code_uni)

			# raise (not throw): throw is for catch/throw control flow
			if sorted_chars[0].bytes.size != 2
				raise "The encoding #{encoding} uses more than two bytes to encode characters"
			end

			# "%%" is required to emit the literal "%" that ends the TeX line;
			# a lone trailing "%" is an incomplete format specifier
			file_out.puts sprintf("\\def^^%02x#1{%%", first_byte_code.hex)
			sorted_chars.each do |uni_character|
				# \ifx#1<second UTF-8 byte><encoded byte>\else % <char> - U+XXXX - <name>
				file_out.puts sprintf("\t\\ifx#1^^%02x^^%02x\\else %% %s - U+%04X - %s", uni_character.bytes[1], uni_character.code_enc, [uni_character.code_uni].pack('U'), uni_character.code_uni, uni_character.name)
			end
			file_out.puts "\t\\errmessage{Hyphenation pattern file corrupted or #{encoding} encoding not supported!}"
			# one \fi for every \ifx opened above, then close the \def body
			file_out.puts(("\\fi" * sorted_chars.size) + "}")
		end

		file_out.puts '%'
		file_out.puts '% ensure all the chars above have valid \lccode values'
		file_out.puts '%'
		e.lowercase_characters.each do |character|
			# two upper-case hex digits of the encoded byte, e.g. \lccode"FF="FF
			code = [ character.code_enc ].pack("c").unpack("H2").first.upcase
			ux_code = sprintf("U+%04X", character.code_uni)
			file_out.puts "\\lccode\"#{code}=\"#{code} % #{[character.code_uni].pack('U')} - #{ux_code} - #{character.name}"
		end
		file_out.puts
	end
end


