java - PDFBox TextPosition x, y and width, height off by factor of 2 -

I wrote a short library to extract the position of anchor text from a PDF document so that I can later render the page as an image to a BufferedImage and layer an HTML form on top of it. It works, but I have to scale x, y, width, and height by 2 in order to make it work correctly. I am testing it by rendering the page to a BufferedImage and drawing the bounding boxes in red. Basically... why is it off by a factor of 2, and can I count on that factor being constant? And, naturally, I realize that if the size of the image changes, the x, y coordinates and the width and height will have to scale appropriately. Does the convert-to-image step perhaps scale it?

Here is the code:

AnchorTextRipper.java

import java.awt.rectangle; import java.io.ioexception; import java.util.hashmap;  import org.apache.pdfbox.util.pdftextstripper; import org.apache.pdfbox.util.textposition;  public class anchortextripper extends pdftextstripper {     protected enum scanstate {         init,         searching,         found_possible,         scanning_anchor,         done     }      protected hashmap<string, rectangle> anchors = new hashmap<string, rectangle>();      // scanning variables     protected scanstate state = scanstate.init;     protected textposition lastfoundanchor;     protected stringbuilder lastfoundanchortext;     protected double lastwidth;     protected rectangle lastfoundanchorrect;      public anchortextripper() throws ioexception {         super();         this.setsortbyposition(true);     }      /**      * method provided event interface allow subclass perform      * specific functionality when text needs processed.      *      * @param text      *            text processed      */     @override     protected void processtextposition(textposition text) {         switch(state) {         case init:             state = scanstate.searching;             lastfoundanchor = null;             lastfoundanchortext = new stringbuilder();             lastwidth = 0.0;             lastfoundanchorrect = null;             break;         case searching:             if (text.getcharacter().equals("$")) {                 state = scanstate.found_possible;                 lastfoundanchor = text;             }             break;         case found_possible:             if (text.getcharacter().equals("{")) {                 state = scanstate.scanning_anchor;             }             break;         case scanning_anchor:             if (text.getcharacter().equals("}")) {                 state = scanstate.done;                 break;             }              lastfoundanchortext.append(text.getcharacter());             break;         case done:             
system.out.println(string.format("%f, %f (%f, %f) [%f, %f]", lastfoundanchor.getx(), lastfoundanchor.gety(), lastfoundanchor.getxscale(), lastfoundanchor.getyscale(), lastfoundanchor.getwidth(), lastfoundanchor.getheight()));              lastfoundanchorrect = new rectangle((int)math.round(lastfoundanchor.getx() * 2) , (int)math.round((lastfoundanchor.gety() * 2) - lastfoundanchor.getheight() * 2), (int)math.round(lastwidth) * 2, (int)math.round(lastfoundanchor.getheight() * 2));             anchors.put(lastfoundanchortext.tostring(), lastfoundanchorrect);             state = scanstate.init;             break;         }          if (state != scanstate.searching) {             lastwidth += text.getwidth();         }     } }

AnchorTextLocatorService.java

import org.apache.pdfbox.exceptions.cryptographyexception; import org.apache.pdfbox.pdmodel.pddocument; import org.apache.pdfbox.pdmodel.pdpage; import org.apache.pdfbox.pdmodel.common.pdstream;  public class anchortextlocatorservice {     protected anchortextripper ripper = new anchortextripper();      public anchortextlocatorservice(string filename) throws ioexception {         pddocument document = null;          try {             document = pddocument.load(filename);             if (document.isencrypted()) {                 document.decrypt("");             }              @suppresswarnings("unchecked")             list<pdpage> allpages = document.getdocumentcatalog().getallpages();              (int = 0; < allpages.size(); i++) {                 pdpage page = (pdpage) allpages.get(i);                 pdstream contents = page.getcontents();                 if (contents != null) {                     ripper.processstream(page, page.findresources(), page.getcontents().getstream());                 }             }         } catch (cryptographyexception e) {             // todo auto-generated catch block             e.printstacktrace();         } {             if (document != null) {                 document.close();             }         }     }        public hashmap<string, rectangle> getanchors() {         return ripper.anchors;     }      public rectangle getanchorrect(string anchortext) {         return ripper.anchors.get(anchortext);     } }

Application.java

import java.awt.color; import java.awt.graphics2d; import java.awt.rectangle; import java.awt.image.bufferedimage; import java.io.file; import java.util.map.entry;  import javax.imageio.imageio;  import org.apache.pdfbox.pdmodel.pddocument; import org.apache.pdfbox.pdmodel.pdpage;  public class application {       /**      * print documents data.      *      * @param args      *            command line arguments.      *      * @throws exception      *             if there error parsing document.      */     public static void main(string[] args) throws exception {         pddocument document = pddocument.load("test.pdf");         if (document.isencrypted()) {             document.decrypt("");         }          pdpage page = (pdpage)document.getdocumentcatalog().getallpages().get(0);         bufferedimage bi = page.converttoimage();          anchortextlocatorservice ats = new anchortextlocatorservice("test.pdf");          (entry<string, rectangle> anchor : ats.getanchors().entryset()) {             system.out.println(anchor.getkey() + " => " + anchor.getvalue());              graphics2d g = (graphics2d)bi.getgraphics();             g.setcolor(color.red);             g.drawrect(anchor.getvalue().x, anchor.getvalue().y, anchor.getvalue().width, anchor.getvalue().height);         }          imageio.write(bi, "png", new file("test.png"));     } }

https://pdfbox.apache.org/apidocs/org/apache/pdfbox/pdmodel/pdpage.html

Sorry... I read the doc, which I should have done first. PDPage::convertToImage() outputs at double resolution. This might be helpful to someone else.

Search This Blog

Back

java - PDFBox TextPosition x, y and width, height off by factor of 2 -

Comments

Post a Comment

Popular posts from this blog

c# - HttpResponseMessage System.InvalidOperationException -

sql - Postgresql error: "failed to find conversion function from unknown to text" -

how to remove index.php file from url in codeigniter? -