Java - PDFBox TextPosition x, y, width, and height are off by a factor of 2
I wrote a short library to extract the position of anchor text from a PDF document so that I can later render the page as a BufferedImage and layer an HTML form on top of it. It works, but I have to scale x, y, width, and height by 2 in order to make it work correctly. I am testing it by rendering the page to a BufferedImage and drawing the bounding boxes in red. Basically... why is it off by a factor of 2, and can I count on that factor being constant? And, naturally, I realize that if the size of the image changes, the x, y coordinates and the width and height have to scale accordingly. Does converting the page to an image perhaps scale it?
Here is the code:
anchortextripper.java
import java.awt.rectangle; import java.io.ioexception; import java.util.hashmap; import org.apache.pdfbox.util.pdftextstripper; import org.apache.pdfbox.util.textposition; public class anchortextripper extends pdftextstripper { protected enum scanstate { init, searching, found_possible, scanning_anchor, done } protected hashmap<string, rectangle> anchors = new hashmap<string, rectangle>(); // scanning variables protected scanstate state = scanstate.init; protected textposition lastfoundanchor; protected stringbuilder lastfoundanchortext; protected double lastwidth; protected rectangle lastfoundanchorrect; public anchortextripper() throws ioexception { super(); this.setsortbyposition(true); } /** * method provided event interface allow subclass perform * specific functionality when text needs processed. * * @param text * text processed */ @override protected void processtextposition(textposition text) { switch(state) { case init: state = scanstate.searching; lastfoundanchor = null; lastfoundanchortext = new stringbuilder(); lastwidth = 0.0; lastfoundanchorrect = null; break; case searching: if (text.getcharacter().equals("$")) { state = scanstate.found_possible; lastfoundanchor = text; } break; case found_possible: if (text.getcharacter().equals("{")) { state = scanstate.scanning_anchor; } break; case scanning_anchor: if (text.getcharacter().equals("}")) { state = scanstate.done; break; } lastfoundanchortext.append(text.getcharacter()); break; case done: system.out.println(string.format("%f, %f (%f, %f) [%f, %f]", lastfoundanchor.getx(), lastfoundanchor.gety(), lastfoundanchor.getxscale(), lastfoundanchor.getyscale(), lastfoundanchor.getwidth(), lastfoundanchor.getheight())); lastfoundanchorrect = new rectangle((int)math.round(lastfoundanchor.getx() * 2) , (int)math.round((lastfoundanchor.gety() * 2) - lastfoundanchor.getheight() * 2), (int)math.round(lastwidth) * 2, (int)math.round(lastfoundanchor.getheight() * 2)); anchors.put(lastfoundanchortext.tostring(), 
lastfoundanchorrect); state = scanstate.init; break; } if (state != scanstate.searching) { lastwidth += text.getwidth(); } } }
anchortextlocatorservice.java
import org.apache.pdfbox.exceptions.cryptographyexception; import org.apache.pdfbox.pdmodel.pddocument; import org.apache.pdfbox.pdmodel.pdpage; import org.apache.pdfbox.pdmodel.common.pdstream; public class anchortextlocatorservice { protected anchortextripper ripper = new anchortextripper(); public anchortextlocatorservice(string filename) throws ioexception { pddocument document = null; try { document = pddocument.load(filename); if (document.isencrypted()) { document.decrypt(""); } @suppresswarnings("unchecked") list<pdpage> allpages = document.getdocumentcatalog().getallpages(); (int = 0; < allpages.size(); i++) { pdpage page = (pdpage) allpages.get(i); pdstream contents = page.getcontents(); if (contents != null) { ripper.processstream(page, page.findresources(), page.getcontents().getstream()); } } } catch (cryptographyexception e) { // todo auto-generated catch block e.printstacktrace(); } { if (document != null) { document.close(); } } } public hashmap<string, rectangle> getanchors() { return ripper.anchors; } public rectangle getanchorrect(string anchortext) { return ripper.anchors.get(anchortext); } }
application.java
import java.awt.color; import java.awt.graphics2d; import java.awt.rectangle; import java.awt.image.bufferedimage; import java.io.file; import java.util.map.entry; import javax.imageio.imageio; import org.apache.pdfbox.pdmodel.pddocument; import org.apache.pdfbox.pdmodel.pdpage; public class application { /** * print documents data. * * @param args * command line arguments. * * @throws exception * if there error parsing document. */ public static void main(string[] args) throws exception { pddocument document = pddocument.load("test.pdf"); if (document.isencrypted()) { document.decrypt(""); } pdpage page = (pdpage)document.getdocumentcatalog().getallpages().get(0); bufferedimage bi = page.converttoimage(); anchortextlocatorservice ats = new anchortextlocatorservice("test.pdf"); (entry<string, rectangle> anchor : ats.getanchors().entryset()) { system.out.println(anchor.getkey() + " => " + anchor.getvalue()); graphics2d g = (graphics2d)bi.getgraphics(); g.setcolor(color.red); g.drawrect(anchor.getvalue().x, anchor.getvalue().y, anchor.getvalue().width, anchor.getvalue().height); } imageio.write(bi, "png", new file("test.png")); } }
https://pdfbox.apache.org/apidocs/org/apache/pdfbox/pdmodel/pdpage.html
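Sorry... I should have read the documentation first. PDPage::convertToImage() outputs the image at double resolution: PDF coordinates are in units of 1/72 inch, and the no-argument convertToImage() renders at twice that (144 DPI), so every coordinate has to be multiplied by 2. The factor is constant for that call; if you render with an explicit resolution instead, the factor is resolution / 72. This might be helpful to someone else.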
Comments
Post a Comment