/**
 * @Author Eric Jensen
 * Date: July, 2000
 * Copyright (C) 2000 Eric Jensen <ej@ir.iit.edu>
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * This is a wrapper for the Pj PDF parser...  It is my first use of Pj
 * and I never had time to clean it up, so don't take it too seriously.  :)
 *
 */

import java.io.*;
import java.util.*;

import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;

/**
 * A thin wrapper around the Pj PDF parser that walks a document's page tree
 * and pulls the literal text strings out of each page's contents streams.
 *
 * Progress and diagnostic messages are written to System.out by the
 * extraction methods.
 */
public class PjWrapper {

    /** The PDF document opened by the constructor. */
    Pdf pdf;

    /** The document catalog object. */
    PjCatalog catalog;

    /** The root node of the pages tree, resolved from the catalog. */
    PjPagesNode rootPage;

    /**
     * Opens the given PDF file and locates its catalog and the root of its
     * pages tree.
     *
     * @param filename path of the PDF file to open
     * @throws IOException if thrown by Pj while opening the file
     * @throws PjException if thrown by Pj while opening or resolving objects
     */
    public PjWrapper(String filename) throws IOException, PjException {
        pdf = new Pdf(filename);

        // hopefully the catalog can never be a reference...
        catalog = (PjCatalog) pdf.getObject(pdf.getCatalog());

        // root node of pages tree is specified by a reference
        // in the catalog
        rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
    }

    /**
     * Command-line entry point: extracts the text of the PDF named by the
     * first argument. The extracted text is reported through the diagnostic
     * output of {@link #getAllText()}.
     *
     * @param args command-line arguments; args[0] is the PDF file name
     */
    public static void main(String[] args) throws IOException, PjException {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: java PjWrapper <file.pdf>");
            return;
        }

        PjWrapper testWrapper = new PjWrapper(args[0]);
        testWrapper.getAllText();
    }

    /**
     * Returns as much text as we can extract from the PDF.
     * This currently includes the literal strings, i.e. the parenthesized
     * (...) runs, found in the contents streams of every page.
     *
     * NOTE: Pj does not support LZW, so some text in some PDF's may not
     *       be indexable
     *
     * @return a list of Strings, one per contents stream that yielded any
     *         text (streams that yield no text are omitted)
     * @throws PjException if thrown by Pj while reading or decompressing
     *         a stream
     */
    public LinkedList getAllText() throws PjException {
        LinkedList stringList = new LinkedList();
        Iterator streamIter = getAllContentsStreams().iterator();

        System.out.println("Going through streams...");

        while (streamIter.hasNext()) {
            System.out.println("Getting next stream");
            PjStream stream = (PjStream) streamIter.next();
            System.out.println("Adding text from stream with filter: " + getFilterString(stream));
            stream = stream.flateDecompress();

            System.out.println("Adding text from stream with filter after decompress: " + getFilterString(stream));

            // NOTE(review): the bytes are decoded with the platform default
            // charset, so results for bytes above 0x7F may vary by platform.
            String streamText = extractLiteralText(new String(stream.getBuffer()));

            System.out.println("Text from stream is: " + streamText);

            if (streamText.length() > 0) {
                stringList.add(streamText);
            }
        }

        return stringList;
    }

    /**
     * Decodes and concatenates every parenthesized literal string in the
     * given stream data. Literals that fail to decode are reported on
     * System.out and skipped.
     */
    private static String extractLiteralText(String streamData) {
        StringBuffer text = new StringBuffer();

        // Start at offset 0 so a literal that opens on the very first
        // character of the stream is not missed.
        int searchFrom = 0;

        while (true) {
            int textStart = streamData.indexOf('(', searchFrom);
            if (textStart < 0) {
                break;
            }

            int textEnd = findLiteralEnd(streamData, textStart + 1);
            if (textEnd < 0) {
                break;
            }

            String literal = streamData.substring(textStart, textEnd + 1);
            try {
                text.append(PjString.decodePdf(literal));
            } catch (Exception e) {
                // best effort: report the bad literal and keep going
                System.out.println("malformed string: " + literal);
            }

            searchFrom = textEnd + 1;
        }

        return text.toString();
    }

    /**
     * Finds the index of the first ')' at or after 'from' that is not
     * escaped with a backslash, or -1 if there is none. Unescaped nested
     * parentheses are not balanced; the first unescaped ')' ends the literal.
     */
    private static int findLiteralEnd(String data, int from) {
        int i = from;
        while (i < data.length()) {
            char c = data.charAt(i);
            if (c == '\\') {
                // skip the backslash together with the character it escapes
                i += 2;
            } else if (c == ')') {
                return i;
            } else {
                i++;
            }
        }
        return -1;
    }

    /**
     * Builds a printable description of the filter(s) named in a stream's
     * dictionary.
     *
     * @param stream the stream whose dictionary is inspected
     * @return the filter name, or the filter names separated by single
     *         spaces when the filter entry is an array; the empty string
     *         when there is no filter entry or the array has no vector
     * @throws PjException if thrown by Pj while reading the dictionary
     */
    public static String getFilterString(PjStream stream) throws PjException {
        System.out.println("getting filter from dictionary");
        PjObject filter = stream.getStreamDictionary().getFilter();
        if (filter == null) {
            System.out.println("Got null filter");
            return "";
        }
        System.out.println("got it");

        String filterString;

        // filter should either be a name or an array of names
        if (filter instanceof PjName) {
            System.out.println("getting filter string from simple name");
            filterString = ((PjName) filter).getString();
        } else {
            System.out.println("getting filter string from array of names");

            Vector nameVector = ((PjArray) filter).getVector();
            if (nameVector == null) {
                System.out.println("got null vector for list of names");
                return "";
            }

            StringBuffer names = new StringBuffer();
            Iterator nameIter = nameVector.iterator();
            while (nameIter.hasNext()) {
                names.append(((PjName) nameIter.next()).getString());
                if (nameIter.hasNext()) {
                    names.append(" ");
                }
            }
            filterString = names.toString();
        }

        System.out.println("got filter string");

        return filterString;
    }

    /**
     * Collects the contents streams of every page found under the root of
     * the pages tree.
     *
     * @return a list of all the contents of all the pages
     */
    public LinkedList getAllContentsStreams() throws InvalidPdfObjectException {
        return getContentsStreams(getAllPages());
    }

    /**
     * Get contents streams from the list of PjPage objects. Pages that have
     * no contents entry contribute nothing to the result.
     *
     * @param pages a list of PjPage objects
     * @return a list of all the contents of the pages
     */
    public LinkedList getContentsStreams(LinkedList pages) throws InvalidPdfObjectException {
        LinkedList streams = new LinkedList();
        Iterator pageIter = pages.iterator();

        while (pageIter.hasNext()) {
            PjObject rawContents = ((PjPage) pageIter.next()).getContents();
            if (rawContents == null) {
                // a page without contents has nothing to extract
                continue;
            }

            PjObject contents = pdf.resolve(rawContents);
            if (contents == null) {
                continue;
            }

            // should only be a stream or an array of streams (or refs to streams)
            if (contents instanceof PjStream) {
                streams.add(contents);
            } else {
                Vector streamVector = ((PjArray) contents).getVector();
                if (streamVector == null) {
                    continue;
                }

                Iterator streamsIter = streamVector.iterator();
                while (streamsIter.hasNext()) {
                    streams.add(pdf.resolve((PjObject) streamsIter.next()));
                }
            }
        }

        return streams;
    }

    /**
     * Performs a depth-first traversal of the pages tree from the root node.
     *
     * @return a list of all the PjPage objects, in the order their kids
     *         arrays list them
     */
    public LinkedList getAllPages() throws InvalidPdfObjectException {
        LinkedList pages = new LinkedList();
        getPages(rootPage, pages);
        return pages;
    }

    /**
     * Performs a depth-first traversal of the pages tree from the node
     * passed to it, appending every PjPage found to the supplied list.
     *
     * @param node  a pages-tree node, or a reference to one
     * @param pages the list that receives the PjPage objects under node
     */
    public void getPages(PjObject node, LinkedList pages) throws InvalidPdfObjectException {

        PjPagesNode pageNode;

        // let's hope pdf's don't have pointers to pointers
        if (node instanceof PjReference) {
            pageNode = (PjPagesNode) pdf.resolve(node);
        } else {
            pageNode = (PjPagesNode) node;
        }

        if (pageNode == null) {
            // nothing to traverse
            return;
        }

        if (pageNode instanceof PjPage) {
            pages.add(pageNode);
            return;
        }

        // kids is normally a direct array, but tolerate a reference to one
        PjObject kids = ((PjPages) pageNode).getKids();
        if (kids instanceof PjReference) {
            kids = pdf.resolve(kids);
        }
        if (kids == null) {
            return;
        }

        Vector kidVector = ((PjArray) kids).getVector();
        if (kidVector == null) {
            return;
        }

        Iterator kidIterator = kidVector.iterator();
        while (kidIterator.hasNext()) {
            getPages((PjObject) kidIterator.next(), pages);
        }
    }

    /**
     * @return the underlying Pj document object
     */
    public Pdf getPdf() {
        return pdf;
    }
}

