#include <stdio.h>
#include <string.h>
#include <ctype.h>

#ifdef WIN32
#include <windows.h>
#endif

#include <zlib.h>

#include <ifractal_utils.h>
#include <if_string.h>
#include <parser.h>
#include <json.h>
#include <cgi.h>



//Number of most recent characters kept for back-reference lookups:
#define OLDCHAR 15

// ////////////////////////////////////////////////////////////////////////// //
// Reads the unsigned decimal number (digits and '.') that ends at or before
// search[lastcharoffset], ignoring the blanks (' ') that follow it.
//
//   search         - character window; indexes 0..lastcharoffset are read
//   lastcharoffset - index of the last character to take into account
//
// Returns the parsed value, or -1.0 when no number is found at that position.
// A sign is never consumed, so a successful parse is always >= 0.
_PRIVATE float ExtractNumber(const char* search, int lastcharoffset)
{
	char buffer[OLDCHAR + 5];
	int last = lastcharoffset;
	int first, n;
	float flt = -1.0;

	if ((search == NULL) || (lastcharoffset < 0))
		return(-1.0);

	// Skip the blanks between the number and whatever follows it.
	while ((last >= 0) && (search[last] == ' '))
		last--;

	// Walk back to the first character of the number. Index 0 is included, so
	// a number starting at the very beginning of the window is kept whole.
	// The cast keeps isdigit() well defined for bytes >= 0x80 when char is signed.
	first = last;
	while ((first >= 0) && (isdigit((unsigned char) search[first]) || (search[first] == '.')))
		first--;
	first++;

	n = last - first + 1;
	if (n <= 0)
		return(-1.0);

	// Never copy more than the local buffer can hold (terminator included).
	if (n > (int) sizeof(buffer) - 1)
		n = (int) sizeof(buffer) - 1;

	memcpy(buffer, search + first, n);
	buffer[n] = 0;

	// sscanf() returns EOF (non-zero) on input failure, so require exactly
	// one successful conversion.
	if (sscanf(buffer, "%f", &flt) == 1)
		return(flt);

	return(-1.0);
}
// ////////////////////////////////////////////////////////////////////////// //

// ////////////////////////////////////////////////////////////////////////// //
// Tells whether the two-character operator `search` is the token that ends
// one position before the newest character of the `recent` window.
//
// The operator must occupy recent[OLDCHAR-3] and recent[OLDCHAR-2], and both
// neighbours, recent[OLDCHAR-4] and recent[OLDCHAR-1], must be a delimiter
// (NUL, space, CR or LF).
//
// Returns 1 on a match and 0 otherwise.
_PRIVATE int seen2(const char* search, char* recent)
{
	// Delimiter positions, in the order they are tested: after, then before.
	const int slot[2] = { OLDCHAR - 1, OLDCHAR - 4 };
	int k;

	if (recent[OLDCHAR-3] != search[0])
		return(0);

	if (recent[OLDCHAR-2] != search[1])
		return(0);

	for (k = 0 ; k < 2 ; k++)
	{
		switch (recent[slot[k]])
		{
			case 0:
			case ' ':
			case 0x0d:
			case 0x0a:
				break;

			default:
				return(0);
		}
	}

	return(1);
}
// ////////////////////////////////////////////////////////////////////////// //


// ////////////////////////////////////////////////////////////////////////// //
// Extracts the visible text of one decoded PDF content stream.
//
// The stream is scanned byte by byte while a window with the last OLDCHAR
// characters (oc) is kept, so operators and their numeric operands can be
// recognised after the fact:
//   - "BT" / "ET" open and close a text object ("ET" also emits '\n');
//   - "TD" emits '\n' when its last operand is > 1.0 and '\t' when it is
//     < 1.0 (which includes the -1.0 "no number" result);
//   - "(" ... ")" delimit a string literal whose characters are copied out;
//     a number > 1000 right before "(" emits '\t', a number > 100 emits ' ';
//   - inside a literal a backslash makes the next character literal.
//
//   output - decoded stream bytes (not required to be NUL-terminated)
//   len    - number of bytes in `output`
//   txtout - receives a NUL-terminated buffer obtained from if_malloc();
//            pdf2txt_process() releases it with if_free()
//
// Returns the number of characters stored in *txtout (terminator excluded).
//
// NOTE(review): the result of if_malloc() is used unchecked, as before;
// confirm how if_malloc() reports an allocation failure.
_PRIVATE int pdf2txt_process_element(_IN char* output, _IN size_t len, _OUT char **txtout)
{
	char *txt, oc[OLDCHAR], c;
	int intextobject = 0;
	int nextliteral = 0;
	int rbdepth = 0;
	int j, p = 0;
	size_t i;
	float num;

	// One iteration appends at most two characters (the "TD" separator and
	// then one from the if/else chain below); one more byte holds the final
	// terminator. This also keeps the len == 0 case inside the allocation.
	txt = if_malloc(2 * len + 1);

	for (j = 0 ; j < OLDCHAR ; j++)
		oc[j] = ' ';

	for (i = 0 ; i < len ; i++)
	{
		c = output[i];
		if (intextobject)
		{
			if ((rbdepth == 0) && (seen2("TD", oc)))
			{
				//Positioning.
				//See if a new line has to start or just a tab. The operand
				//ends right before " TD" + delimiter, i.e. at OLDCHAR - 5.
				num = ExtractNumber(oc, OLDCHAR - 5);
				if (num > 1.0)
					txt[p++] = '\n';

				if (num < 1.0)
					txt[p++] = '\t';
			}

			if ((rbdepth == 0) && (seen2("ET", oc)))
			{
				//End of a text object, also go to a new line.
				intextobject = 0;
				txt[p++] = '\n';
			}
			else if ((c == '(') && (rbdepth == 0) && !nextliteral)
			{
				//Start outputting text!
				rbdepth = 1;
				//See if a space or tab (>1000) is called for by looking
				//at the number in front of (
				num = ExtractNumber(oc, OLDCHAR-1);
				if (num > 0)
				{
					if (num > 1000.0)
					{
						txt[p++] = '\t';
					}
					else if (num > 100.0)
					{
						txt[p++] = ' ';
					}
				}
			}
			else if ((c == ')') && (rbdepth == 1) && !nextliteral) 
			{
				//Stop outputting text
				rbdepth = 0;
			}
			else if (rbdepth == 1) 
			{
				//Just a normal text character:
				if ((c == '\\') && !nextliteral)
				{
					//Only print out next character no matter what. Do not interpret.
					nextliteral = 1;
				}
				else
				{
					nextliteral = 0;

					//Printable ASCII and bytes >= 0x80 are kept, control
					//characters are dropped. The unsigned comparison gives the
					//same answer whether plain char is signed or not.
					if (((c >= ' ') && (c <= '~')) || ((unsigned char) c >= 0x80))
						txt[p++] = c;
				}
			}
		}

		//Store the recent characters for when we have to go back for a number:
		for (j = 0 ; j < OLDCHAR - 1 ; j++)
			oc[j] = oc[j + 1];

		oc[OLDCHAR-1] = c;
		if (!intextobject)
		{
			if (seen2("BT", oc))
			{
				//Start of a text object:
				intextobject = 1;
			}
		}
	}

	*txtout = txt;
	txt[p] = 0;

	return(p);
}
// ////////////////////////////////////////////////////////////////////////// //


// ////////////////////////////////////////////////////////////////////////// //
// Finds the first occurrence of the NUL-terminated string `search` within the
// first `buffersize` bytes of `buffer`. The scan is binary safe: NUL bytes in
// `buffer` do not stop it, and bytes are compared as unsigned values.
//
// Returns the offset of the match from `buffer`, or -1 when there is none
// (also for NULL arguments or when `search` is longer than the buffer).
// An empty `search` matches at offset 0.
_PRIVATE int pdf2txt_find_string(_IN unsigned char *buffer, _IN char *search, _IN size_t buffersize)
{
	size_t len, pos;

	if ((buffer == NULL) || (search == NULL))
		return(-1);

	len = strlen(search);
	if (len > buffersize)
		return(-1);

	// The last candidate position is buffersize - len (inclusive), so a match
	// that ends on the final byte of the buffer is still reported.
	for (pos = 0 ; pos <= buffersize - len ; pos++)
	{
		if (memcmp(buffer + pos, search, len) == 0)
			return((int) pos);
	}

	return(-1);
}
// ////////////////////////////////////////////////////////////////////////// //

// ////////////////////////////////////////////////////////////////////////// //
// Debug helper: dumps `len` bytes of `buf` to stderr in hexadecimal, 16 bytes
// per row (with an extra gap after the 8th), followed by the same bytes as
// printable ASCII ('.' replaces everything else).
// A final row shorter than 16 bytes is padded so that its ASCII column lines
// up with the full rows and the output always ends with a newline.
_PRIVATE void pdf2txt_show_hex(unsigned char *buf, size_t len)
{
	char line[17];
	size_t i, col;

	for (i = 0 ; i < len ; i++)
	{
		col = i % 16;

		// Extra gap between the two groups of 8 bytes.
		if (col == 8)
			fprintf(stderr, "  ");

		fprintf(stderr, "%02X ", buf[i]);

		line[col] = ((buf[i] >= ' ') && (buf[i] <= '~')) ? buf[i] : '.';
		line[col + 1] = 0;

		if (col == 15)
			fprintf(stderr, "   %s\n", line);
	}

	// Flush an incomplete last row: pad the missing hex cells, then print the
	// ASCII column gathered so far.
	col = len % 16;
	if (col != 0)
	{
		for ( ; col < 16 ; col++)
		{
			if (col == 8)
				fprintf(stderr, "  ");

			fprintf(stderr, "   ");
		}

		fprintf(stderr, "   %s\n", line);
	}
}
// ////////////////////////////////////////////////////////////////////////// //

// ////////////////////////////////////////////////////////////////////////// //
// Walks every "stream" ... "endstream" section of a PDF held in memory and
// hands the text extracted from each one to `callback`.
//
// Each section is first inflated with zlib; when inflateInit()/inflate() fail
// the raw bytes are scanned instead. Sections without payload are counted but
// not processed.
//
//   buffer    - PDF file contents
//   length    - number of bytes in `buffer`
//   callback  - called as callback(text, text_length, user_data) for every
//               section that produced text; `text` is NUL-terminated and is
//               released right after the call returns. Its return value is
//               ignored.
//   user_data - opaque pointer forwarded to `callback`
//
// Returns the number of stream sections found (0 for NULL arguments).
//
// NOTE(review): a single inflate() call is made into a buffer 20 times the
// size of the compressed data, so output beyond that estimate is not
// extracted.
_PUBLIC int pdf2txt_process(
	_IN unsigned char *buffer, 
	_IN size_t length, 
	_IN int (*callback)(char *, size_t, void *), 
	_INOUT void *user_data)
{
	char delim_end[] = "endstream";
	char delim_start[] = "stream";
	unsigned char *buf = buffer;
	int stream_start, stream_end, next, data_len, len, l;
	int qty_blocks, z_ready, inflated;
	size_t outsize;
	char *outbuf, *txt;
	z_stream zstrm;

	if ((buffer == NULL) || (callback == NULL))
		return(0);

	len = length - (buf - buffer);
	for (qty_blocks = 0 ; (stream_start = pdf2txt_find_string(buf, delim_start, len)) >= 0 ; qty_blocks++)
	{
		stream_end = pdf2txt_find_string(buf, delim_end, len);
		if (stream_end < stream_start)
			break;

		// Offset just past "endstream": the next search resumes there. It is
		// taken before stream_end is trimmed below, so the cursor never lands
		// in the middle of the keyword.
		next = stream_end + (int) strlen(delim_end);

		stream_start += strlen(delim_start);

		// Drop the end-of-line that follows "stream"...
		if ((buf[stream_start] == 0x0d) && (buf[stream_start+1] == 0x0a)) 
			stream_start += 2;
		else if (buf[stream_start] == 0x0a) 
			stream_start++;

		// ...and the one that precedes "endstream".
		if ((buf[stream_end-2] == 0x0d) && (buf[stream_end-1] == 0x0a))
			stream_end -= 2;
		else if (buf[stream_end-1] == 0x0a)
			stream_end--;

		// An empty stream ("stream\r\nendstream") leaves the two offsets
		// crossed; there is nothing to decode then, and a negative length
		// must never reach the allocator.
		data_len = stream_end - stream_start;
		if (data_len > 0)
		{
			// Output is estimated as 20x the compressed size.
			outsize = 20 * (size_t) data_len;
			outbuf = if_malloc(outsize);

			memset(&zstrm, 0, sizeof(zstrm));

			// One byte past the payload is handed over as well, exactly as
			// before. NOTE(review): the text scanner only reacts to an
			// operator when the character after its delimiter arrives, so
			// dropping this byte could lose a final line break.
			zstrm.avail_in = data_len + 1;
			zstrm.avail_out = (uInt) outsize;
			zstrm.next_in = (Bytef*) (buf + stream_start);
			zstrm.next_out = (Bytef*) outbuf;

			z_ready = (inflateInit(&zstrm) == Z_OK);
			inflated = z_ready && (inflate(&zstrm, Z_PARTIAL_FLUSH) >= 0);

			if (inflated)
				l = pdf2txt_process_element(outbuf, zstrm.total_out, &txt);
			else
				l = pdf2txt_process_element((char *) (buf + stream_start), data_len + 1, &txt);

			if (l > 0)
				callback(txt, l, user_data);

			if_free(txt);
			if_free(outbuf);

			// Only a successfully initialised stream owns zlib state.
			if (z_ready)
				inflateEnd(&zstrm);
		}

		buf += next;
		len = length - (buf - buffer);
	}

	return(qty_blocks);
}
// ////////////////////////////////////////////////////////////////////////// //


// ////////////////////////////////////////////////////////////////////////// //
// json_array_iter() callback run for each entry of a template page's
// "fields" array: builds the value of one record field.
//
// The template field supplies "name" (key to create in the record) and
// "index" (comma-separated positions in the PDF page array). The strings
// found at those positions are joined, each followed by a space, passed
// through parser_apply_functions() and stored in the record under "name".
//
//   temp_field - template entry describing the field
//   user_data  - void *[3] = { pdf_pag, record, records }, as assembled by
//                pdf2txt_json_process_pag()
//
// Always returns 0; problems are reported on stderr and the field (or the
// missing index) is skipped.
_PRIVATE int pdf2txt_json_process_pag_iter(JSON_VALUE *temp_field, void *user_data)
{
	void **params = (void **) user_data;
	JSON_VALUE *pdf_pag = (JSON_VALUE *) params[0];
	JSON_VALUE *record = (JSON_VALUE *) params[1];
	JSON_VALUE *records = (JSON_VALUE *) params[2];
	char *name, *value, *tk[10], *indexes, *buf;
	STRING_BUFFER *strbuf;
	JSON_VALUE *aux;
	int index, l, i;

	aux = json_object_find(temp_field, "name");
	if (aux == NULL)
	{
		fprintf(stderr, "Campo 'name' nao encontrado no template.\n");
		return(0);
	}
	name = json_get_string(aux);

	aux = json_object_find(temp_field, "index");
	if (aux == NULL)
	{
		fprintf(stderr, "Campo 'index' nao encontrado no template.\n");
		return(0);
	}
	
	// Work on a private copy of the index list that tokenizer() is handed.
	// NOTE(review): presumably tokenizer() splits it in place on ',' and
	// fills at most 10 entries of tk - confirm against its declaration.
	indexes = if_strdup(json_get_string(aux));
	l = tokenizer(',', indexes, tk, 10);

	strbuf = string_new("");
	for (i = 0 ; i < l ; i++)
	{
		index = atoi(tk[i]);
		aux = json_array_index(pdf_pag, index);
		if (aux == NULL)
		{
			fprintf(stderr, "Campo indice %d nao encontrado no pdf.\n", index);
			continue;
		}
		value = json_get_string(aux);
		string_append(strbuf, value);
		string_append(strbuf, " ");
	}

	if_free(indexes);

	value = string_get_text(strbuf);
	buf = if_malloc(BUFFER_LEN);

	// strncpy() does not terminate the destination when the source fills it,
	// so copy one byte less and terminate explicitly.
	strncpy(buf, value, BUFFER_LEN - 1);
	buf[BUFFER_LEN - 1] = 0;

	parser_apply_functions(buf, records, record, NULL, NULL, temp_field);
	json_object_add(record, name, json_string_new(buf));

	if_free(buf);
	string_free(strbuf);

	return(0);
}
// ////////////////////////////////////////////////////////////////////////// //
// json_array_iter() callback run for each entry of a template page's "keys"
// array. A key holds an "index" (position in the PDF page array) and the
// "value" expected at that position.
//
//   key       - template key entry
//   user_data - the PDF page (JSON array of strings)
//
// Returns 0 when the page holds exactly the expected string at that position,
// and 1 when it differs or when any of the pieces is missing.
_PRIVATE int pdf2txt_json_process_pag_check_keys_iter(JSON_VALUE *key, void *user_data)
{
	JSON_VALUE *page = (JSON_VALUE *) user_data;
	JSON_VALUE *node, *cell;
	char *expected;
	int position;

	if ((node = json_object_find(key, "index")) == NULL)
		return(1);

	position = json_get_int(node);

	if ((node = json_object_find(key, "value")) == NULL)
		return(1);

	expected = json_get_string(node);

	if ((cell = json_array_index(page, position)) == NULL)
		return(1);

	return((strcmp(expected, json_get_string(cell)) != 0) ? 1 : 0);
}
// ////////////////////////////////////////////////////////////////////////// //
// Checks a PDF page against the "keys" of a template page.
//
// Returns 1 when the page is accepted (including the case of a template page
// without "keys") and 0, after a message on stderr, when the value returned
// by json_array_iter() is lower than the number of keys.
// NOTE(review): presumably json_array_iter() returns how many entries were
// visited before a callback returned non-zero - confirm in the JSON library.
_PRIVATE int pdf2txt_json_process_pag_check_keys(JSON_VALUE *pdf_pag, JSON_VALUE *temp_pag)
{
	JSON_VALUE *key_list = json_object_find(temp_pag, "keys");
	int iter_ret;

	if (key_list != NULL)
	{
		iter_ret = json_array_iter(key_list, pdf2txt_json_process_pag_check_keys_iter, pdf_pag);
		if (iter_ret < json_array_length(key_list))
		{
			fprintf(stderr, "Chave nao corresponde.\n");
			return(0);
		}
	}

	return(1);
}
// ////////////////////////////////////////////////////////////////////////// //
// Fills `record` with the fields that the template page `temp_pag` describes,
// reading the values from the PDF page `pdf_pag`.
//
// Returns 0 once the "fields" of the template page have been iterated, or -1
// when the page keys do not match or the template page has no "fields".
_PRIVATE int pdf2txt_json_process_pag(
	JSON_VALUE *pdf_pag, 
	JSON_VALUE *records, 
	JSON_VALUE *record, 
	JSON_VALUE *temp_pag)
{
	JSON_VALUE *field_list;
	void *iter_args[3];

	// Layout expected by pdf2txt_json_process_pag_iter().
	iter_args[0] = pdf_pag;
	iter_args[1] = record;
	iter_args[2] = records;

	if (pdf2txt_json_process_pag_check_keys(pdf_pag, temp_pag) &&
		((field_list = json_object_find(temp_pag, "fields")) != NULL))
	{
		json_array_iter(field_list, pdf2txt_json_process_pag_iter, iter_args);
		return(0);
	}

	return(-1);
}
// ////////////////////////////////////////////////////////////////////////// //
// Converts the pages of `pdf_json` into records appended to `records`.
//
// `temp_record` is the template "record" array, one entry per page of a
// record: every group of that many consecutive PDF pages produces one new
// record, and page k of the group is processed with template page k.
//
// Returns 0 after all pages were visited, or -1 (with a message on stderr)
// when the template has no pages, which would otherwise end in a division
// by zero.
_PRIVATE int pdf2txt_json_process(JSON_VALUE *pdf_json, JSON_VALUE *records, JSON_VALUE *temp_record)
{
	JSON_VALUE *record = NULL, *temp_pag, *pdf_pag;
	int pag, qty_pags, qty_rec_pags;

	qty_rec_pags = json_array_length(temp_record);
	if (qty_rec_pags <= 0)
	{
		fprintf(stderr, "Template - campo 'record' vazio.\n");
		return(-1);
	}

	qty_pags = json_array_length(pdf_json);

	for (pag = 0 ; pag < qty_pags ; pag++)
	{
		// First page of a group: open a new record (always true for pag 0,
		// so `record` is set before its first use).
		if ((pag % qty_rec_pags) == 0)
		{
			record = json_object_new(2);
			json_array_add(records, record);
		}

		pdf_pag = json_array_index(pdf_json, pag);
		temp_pag = json_array_index(temp_record, pag % qty_rec_pags);

		pdf2txt_json_process_pag(pdf_pag, records, record, temp_pag);
	}

	return(0);
}
// ////////////////////////////////////////////////////////////////////////// //
// Builds the result object { "globals": ..., "records": [...] } from the
// extracted PDF pages (`json`) and a parsed `template`.
//
// "globals" is a clone of the template's own "globals" when present, or an
// object created from the default list below otherwise. "records" is filled
// by pdf2txt_json_process() using the template's "record" entry; when that
// entry is missing a message goes to stderr and "records" stays empty.
//
// Returns the newly created result object.
_PRIVATE JSON_VALUE * pdf2txt_json_in(_IN JSON_VALUE *json, _IN JSON_VALUE *template)
{
	char *default_globals[] = {"_","pdf2json", NULL,NULL};
	JSON_VALUE *out = json_object_new(2);
	JSON_VALUE *tpl_globals = json_object_find(template, "globals");
	JSON_VALUE *rec_list, *tpl_record;

	json_object_add(out, "globals",
		(tpl_globals != NULL) ? json_clone(tpl_globals) : json_object_new_list(default_globals));

	rec_list = json_array_new(2);
	json_object_add(out, "records", rec_list);

	tpl_record = json_object_find(template, "record");
	if (tpl_record != NULL)
		pdf2txt_json_process(json, rec_list, tpl_record);
	else
		fprintf(stderr, "Template - campo 'record' nao localizado.\n");

	return(out);
}
// ////////////////////////////////////////////////////////////////////////// //
_CALLBACK int pdf2txt_json_pag(char *buf, size_t len, void *user_data)
{
	JSON_VALUE *json = (JSON_VALUE *) user_data;
	JSON_VALUE *pag;
	char *p, *q;

	pag = json_array_new(2);
	for (p = buf, q = p ; *q != 0 ; q++)
	{
		if (*q == '\n')
		{
			*q = 0;
			json_array_add(pag, json_string_new(p));
			p = q + 1;
		}
	}

	json_array_add(pag, json_string_new(p));

	json_array_add(json, pag);

	return(0);
}
// ////////////////////////////////////////////////////////////////////////// //
// Extracts the text of a PDF held in memory as JSON.
//
//   buffer   - PDF file contents
//   length   - number of bytes in `buffer`
//   template - parsed template, or NULL
//
// Without a template the result is the raw array produced through
// pdf2txt_json_pag() (one array of lines per stream). With a template that
// array is converted by pdf2txt_json_in(), between parser_init_modules() and
// parser_finalize_modules(), and then released.
//
// Returns the resulting JSON value, or NULL when `buffer` is NULL.
_PUBLIC JSON_VALUE * pdf2txt_json(_IN unsigned char *buffer, _IN size_t length, _IN JSON_VALUE *template)
{
	JSON_VALUE *pages, *out;

	if (buffer == NULL)
		return(NULL);

	pages = json_array_new(2);
	pdf2txt_process(buffer, length, pdf2txt_json_pag, pages);

	if (template != NULL)
	{
		parser_init_modules();

		out = pdf2txt_json_in(pages, template);
		json_value_free(pages);

		parser_finalize_modules();

		return(out);
	}

	return(pages);
}
// ////////////////////////////////////////////////////////////////////////// //


// ////////////////////////////////////////////////////////////////////////// //
// Extracts the text of a PDF held in memory and returns it serialized as a
// JSON string.
//
//   buffer   - PDF file contents
//   len      - number of bytes in `buffer`
//   template - template as JSON text, or NULL for the raw page/line arrays
//
// A template that fails to parse is reported on stderr and then treated like
// no template at all. Returns whatever json_serialize() produced; main()
// releases it with if_free().
_PUBLIC char * pdf2json(_IN unsigned char *buffer, _IN size_t len, _IN char *template)
{
	JSON_VALUE *tpl = (template != NULL) ? json_parse_mem(template) : NULL;
	JSON_VALUE *doc;
	char *text;

	if ((template != NULL) && (tpl == NULL))
		fprintf(stderr, "Template invalido.\n");

	doc = pdf2txt_json(buffer, len, tpl);
	text = json_serialize(doc);

	json_value_free(doc);
	json_value_free(tpl);

	return(text);
}
// ////////////////////////////////////////////////////////////////////////// //



#ifdef STANDALONE
// ////////////////////////////////////////////////////////////////////////// //
// pdf2txt_process() callback that prints the text of one stream to stdout.
//
// A header with the block number and the text length is printed first, then
// one "<block>.<line> '<text>'" row per line. Text that follows the last '\n'
// is printed as a final row too (it used to be dropped), which matches the
// JSON output, where that tail is kept as the last element.
//
//   buf       - NUL-terminated text of the block (left untouched: a copy is
//               split instead)
//   len       - text length, shown in the header only
//   user_data - int * with the block number; incremented on each call
//
// Always returns 0.
_CALLBACK int pdf2txt_txt(char *buf, size_t len, void *user_data)
{
	int *pag = (int *) user_data;
	char *p, *q, *aux;
	int i;

	fprintf(stdout, "\n-------------------------- BLOCO %03d -------------------------- (%d)\n\n", *pag, (int) len);

	aux = if_strdup(buf);
	for (i = 0, p = aux, q = p ; *q != 0 ; q++)
	{
		if (*q == '\n')
		{
			*q = 0;
			fprintf(stdout, "%03d.%03d '%s'\n", *pag, i, p);
			p = q + 1;
			i++;
		}
	}

	// Remaining text not terminated by a line break.
	if (*p != 0)
		fprintf(stdout, "%03d.%03d '%s'\n", *pag, i, p);

	if_free(aux);

	*pag += 1;

	return(0);
}
// ////////////////////////////////////////////////////////////////////////// //


// One-line tool description handed to if_help_header() by main().
#define DESCRIPT	"Gera txt/json a partir de um PDF.\n"

// Command-line options of the standalone tool, parsed by if_getopt() and
// queried in main() by long name ("pdf", "txt", "json", "template").
// The all-zero entry ends the table.
// NOTE(review): the columns look like { flag, short option, value type,
// long name, default value, flag, help text } - confirm against the
// IF_GETOPT declaration.
IF_GETOPT configs[] = {
	{0, 'p', IF_GETOPT_TYPE_STRING, "pdf", "-", 0, "Arquivo PDF ou '-' para STDIN."},
	{0, 't', IF_GETOPT_TYPE_NONE, "txt", "", 0, "Gera txt em STDOUT."},
	{0, 'j', IF_GETOPT_TYPE_NONE, "json", "", 0, "Gera JSON em STDOUT."},
	{0, 'c', IF_GETOPT_TYPE_STRING, "template", "", 0, "Arquivo template JSON."},
	{0, 0, 0, 0, 0, 0, 0}
};

// ////////////////////////////////////////////////////////////////////////// //
// Entry point of the standalone tool: reads a PDF (file or stdin), optionally
// a JSON template, and writes the extracted text (-t) and/or JSON (-j) to
// stdout.
//
// Exit status: 0 on success; the if_getopt() result when the options are
// invalid or neither -t nor -j was given (help is printed); 2 when the PDF
// cannot be opened; 3 when the template cannot be opened.
int main(int argc, char* argv[])
{
	char *filename, *buf, *aux, *template;
	int len, r;
	FILE *fd;

	r = if_getopt(configs, argc, argv);
	if (	(r < 0) || 
		(!if_getopt_isChecked(configs, "txt") && !if_getopt_isChecked(configs, "json"))
		)
	{
		if_help_header(argv[0], DESCRIPT);
		fprintf(stderr, "Ajuda:\n");
		if_getopt_help(configs);

		fprintf(stderr, "\nUso:\n");
		fprintf(stderr, "\tshell$ %s -p <PDF> -t\n", argv[0]);

		fprintf(stderr, "\nExemplos:\n");
		fprintf(stderr, "\tshell$ %s -p arquivo.pdf -j > arquivo.json\n", argv[0]);
		fprintf(stderr, "\tshell$ %s -p arquivo.pdf -c template_pdf.json -j > result.json\n", argv[0]);

		fprintf(stderr, "\n");

		return(r);
	}

	// A PDF is binary data: "rb" keeps text-mode translation (as done on
	// WIN32 builds) from altering the compressed streams.
	filename = if_getopt_getValue(configs, "pdf");
	if (filename[0] == '-')
		fd = stdin;
	else
		fd = fopen(filename, "rb");

	if (fd == NULL)
		return(2);
	
	len = cgi_get_content(fd, &buf);
	fclose(fd);

	if (if_getopt_isChecked(configs, "template"))
	{
		filename = if_getopt_getValue(configs, "template");
		fd = fopen(filename, "r");
		if (fd == NULL)
		{
			// Do not leave the PDF contents behind on this error path.
			if_free(buf);
			return(3);
		}
		
		cgi_get_content(fd, &template);
		fclose(fd);
	}
	else
		template = NULL;

	if (if_getopt_isChecked(configs, "txt"))
	{
		// r doubles as the block counter that pdf2txt_txt() increments.
		r = 1;

		pdf2txt_process((unsigned char *) buf, len, pdf2txt_txt, &r);
	}

	if (if_getopt_isChecked(configs, "json"))
	{
		aux = pdf2json((unsigned char *) buf, len, template);
		fprintf(stdout, "%s\n", aux);
		if_free(aux);
	}

	if_free(buf);

	if (template != NULL)
		if_free(template);

	return(0);
}
// ////////////////////////////////////////////////////////////////////////// //
#endif
