swing - Java: How to extract text by a selected area from a PDF file using iText? -
I am working on a program that extracts text from a specific area of a PDF file, using Java and the iText library. Right now, I can extract the data by typing in the area coordinates, using this code:
import java.io.ioexception; import com.itextpdf.text.rectangle; import com.itextpdf.text.pdf.pdfreader; import com.itextpdf.text.pdf.parser.filteredtextrenderlistener; import com.itextpdf.text.pdf.parser.locationtextextractionstrategy; import com.itextpdf.text.pdf.parser.pdftextextractor; import com.itextpdf.text.pdf.parser.regiontextrenderfilter; import com.itextpdf.text.pdf.parser.renderfilter; import com.itextpdf.text.pdf.parser.textextractionstrategy; /** * créer par malek boubakri le 03/06/2015 à 15:45. */ public class extractpagecontentarea { // public void parsepdf(float x,float y,float width,float height,string pdf) throws ioexception { pdfreader reader = new pdfreader(pdf); rectangle rect = new rectangle(x, y, width, height); renderfilter filter = new regiontextrenderfilter(rect); textextractionstrategy strategy; (int = 1; <= reader.getnumberofpages(); i++) { strategy = new filteredtextrenderlistener(new locationtextextractionstrategy(), filter); system.out.println(pdftextextractor.gettextfrompage(reader, i, strategy)); } reader.close(); } }
And this code lets me draw a rectangle and save the needed coordinates:
import java.awt.BorderLayout;
import java.awt.Graphics;
import java.awt.Rectangle;
import java.awt.event.MouseEvent;
import java.awt.event.MouseListener;
import java.awt.event.MouseMotionListener;
import java.util.ArrayList;

import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.SwingConstants;

/**
 * A frame on which rectangles can be drawn by dragging the mouse. The start
 * and end points of the drag and the size of the rectangle being drawn are
 * shown in labels around the border of the frame.
 */
public class MouseTracker extends JFrame implements MouseListener, MouseMotionListener {

    private static final long serialVersionUID = 1L;

    private final JLabel mousePosition;
    // Start (x1, y1) and current/end (x2, y2) points of the drag in progress.
    int x1, x2, y1, y2;
    // Width and height of the rectangle being dragged.
    int w, h;
    private final JLabel recStart;
    private final JLabel recStop;
    private final JLabel cords;
    // Rectangles completed so far; all of them are redrawn on every paint.
    private final ArrayList<Rectangle> rectangles = new ArrayList<Rectangle>();
    // True while no drag is in progress, i.e. there is no preview rectangle to draw.
    private boolean isNewRect = true;

    /** Sets up the GUI and registers the mouse event handlers. */
    public MouseTracker() {
        super("rectangle drawer");

        this.mousePosition = new JLabel();
        this.mousePosition.setHorizontalAlignment(SwingConstants.CENTER);
        getContentPane().add(this.mousePosition, BorderLayout.CENTER);

        JLabel text1 = new JLabel();
        text1.setText("at center mouse pointer's coordinates displayed.");
        getContentPane().add(text1, BorderLayout.SOUTH);

        this.recStart = new JLabel();
        getContentPane().add(this.recStart, BorderLayout.WEST);

        this.recStop = new JLabel();
        getContentPane().add(this.recStop, BorderLayout.EAST);

        this.cords = new JLabel();
        getContentPane().add(this.cords, BorderLayout.NORTH);
        updateCordsLabel();

        addMouseListener(this);       // listens for own mouse and
        addMouseMotionListener(this); // mouse-motion events

        setSize(800, 600);
        setVisible(true);
    }

    // MouseListener event handlers

    /** Handles the event fired when the mouse is released right after a press. */
    @Override
    public void mouseClicked(final MouseEvent event) {
        this.mousePosition.setText("clicked @ [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /** Records the start corner of a new rectangle when the mouse is pressed. */
    @Override
    public void mousePressed(final MouseEvent event) {
        this.x1 = event.getX();
        this.y1 = event.getY();
        this.mousePosition.setText("pressed @ [" + this.x1 + ", " + this.y1 + "]");
        this.recStart.setText("start: [" + this.x1 + ", " + this.y1 + "]");
        repaint();
    }

    /** Stores the finished rectangle when the mouse is released after dragging. */
    @Override
    public void mouseReleased(final MouseEvent event) {
        this.x2 = event.getX();
        this.y2 = event.getY();
        this.mousePosition.setText("released @ [" + this.x2 + ", " + this.y2 + "]");
        this.recStop.setText("end: [" + this.x2 + ", " + this.y2 + "]");

        this.rectangles.add(getRectangleFromPoints());

        // Reset the drag state so the next press starts from scratch.
        this.w = this.h = this.x1 = this.y1 = this.x2 = this.y2 = 0;
        this.isNewRect = true;
        updateCordsLabel();
        repaint();
    }

    /**
     * Builds a rectangle from the current start and end points. The drag may go
     * in any direction, so the top-left corner is the smaller of each pair of
     * coordinates and the size is the absolute difference.
     */
    private Rectangle getRectangleFromPoints() {
        int width = this.x1 - this.x2;
        int height = this.y1 - this.y2;
        return new Rectangle(
                width < 0 ? this.x1 : this.x2,
                height < 0 ? this.y1 : this.y2,
                Math.abs(width),
                Math.abs(height));
    }

    /** Shows the current rectangle size in the top label. */
    private void updateCordsLabel() {
        this.cords.setText("w = " + this.w + ", h = " + this.h);
    }

    /** Handles the event fired when the mouse enters the frame. */
    @Override
    public void mouseEntered(final MouseEvent event) {
        this.mousePosition.setText("mouse entered @ [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /** Handles the event fired when the mouse leaves the frame. */
    @Override
    public void mouseExited(final MouseEvent event) {
        this.mousePosition.setText("mouse outside window");
        repaint();
    }

    // MouseMotionListener event handlers

    /** Tracks the end corner while the user drags with a button pressed. */
    @Override
    public void mouseDragged(final MouseEvent event) {
        this.x2 = event.getX();
        this.y2 = event.getY();
        this.mousePosition.setText("dragged @ [" + this.x2 + ", " + this.y2 + "]");

        // Keep the displayed size in step with the rectangle being dragged.
        this.w = Math.abs(this.x2 - this.x1);
        this.h = Math.abs(this.y2 - this.y1);
        updateCordsLabel();

        this.isNewRect = false;
        repaint(); // repaint() schedules a call to paint()
    }

    /** Handles the event fired when the user moves the mouse without dragging. */
    @Override
    public void mouseMoved(final MouseEvent event) {
        this.mousePosition.setText("moved @ [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /** Paints the frame, the rectangle being dragged (if any) and all stored rectangles. */
    @Override
    public void paint(final Graphics g) {
        super.paint(g); // clear the frame surface

        g.drawString("start rec here", this.x1, this.y1);
        g.drawString("end rec here", this.x2, this.y2);

        if (!this.isNewRect) {
            Rectangle newRectangle = getRectangleFromPoints();
            g.drawRect(newRectangle.x, newRectangle.y, newRectangle.width, newRectangle.height);
        }

        for (Rectangle rectangle : this.rectangles) {
            g.drawRect(rectangle.x, rectangle.y, rectangle.width, rectangle.height);
        }
    }

    public static void main(final String args[]) {
        MouseTracker application = new MouseTracker();
        application.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    }
}
I want to use these coordinates to specify the area in the PDF file, but I don't know how to merge the two functions, how to put the drawing space above the document, or how to map the rectangle coordinates to the text coordinates.
How do I draw above another panel?
Should I convert the PDF to an image and put it behind the drawing area?
If I should, can you please suggest a good, free OCR library?
Please, if anything is unclear, leave a comment! Any pointer that puts me on the right road would help, because I'm lost.
Waiting for your help, and thanks (sorry for my bad English).
You have an interesting question and a challenging project. This "answer" may provide some useful ideas, but it is not a finished solution.
You can use something called a glass pane to draw on top of other components.
The most important thing, I think, is to decide which libraries are optimal for your project. iText is a good library and provides all sorts of PDF functionality, including the text extraction you show in your question.
But, as far as I know, there is no support for PDF viewing in iText. You could use a library like ICEpdf for that (see this example). It would be nice if ICEpdf supported text extraction as well, so that you could use one library instead of making ICEpdf work with iText or OCR (and handling issues like zooming the PDF in ICEpdf and compensating for that when you're getting the text).
I'm not sure whether you can extract text with ICEpdf, so iText is still used in the example code below:
// File ExtractSelectionFromPdf.java
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import java.awt.Container;
import java.awt.Point;
import java.io.IOException;
import javax.swing.*;

/**
 * Shows a PDF in an ICEpdf viewer, lets the user drag a rectangle on a glass
 * pane laid over it, and prints the text that iText extracts from that region.
 */
public class ExtractSelectionFromPdf {

    private static String filePath = "[file path pdf file]";

    private PdfViewer pdfViewer;

    public static void main(final String[] arguments) {
        SwingUtilities.invokeLater(() -> new ExtractSelectionFromPdf().launchGui());
    }

    /** Builds the frame, opens the document and installs the glass pane. */
    private void launchGui() {
        final JFrame frame = new JFrame("extract selected text pdf");
        frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        final Container contentPane = frame.getContentPane();

        pdfViewer = new PdfViewer();
        contentPane.add(pdfViewer);
        pdfViewer.openDocument(filePath);

        final CustomGlassPane customGlassPane = new CustomGlassPane(this, contentPane);
        frame.setGlassPane(customGlassPane);
        customGlassPane.setVisible(true);

        frame.setBounds(60, 10, 1800, 1000);
        frame.setVisible(true);
    }

    /**
     * Extracts and prints the text inside the rectangle spanned by two corners.
     * The corners may be given in any order; nothing happens if either is null.
     *
     * @param topLeft     the corner where the drag started
     * @param bottomRight the corner where the drag ended
     */
    public void handleSelection(final Point topLeft, final Point bottomRight) {
        if (topLeft == null || bottomRight == null) {
            return;
        }
        // Normalise so that dragging up or to the left still gives a valid area.
        final int x = Math.min(topLeft.x, bottomRight.x);
        final int y = Math.min(topLeft.y, bottomRight.y);
        final int width = Math.abs(bottomRight.x - topLeft.x);
        final int height = Math.abs(bottomRight.y - topLeft.y);

        final String text = parsePdf(x, y, width, height, filePath);
        System.out.println("text: " + text);
    }

    /**
     * Extracts the text inside a region of the page currently shown in the viewer.
     *
     * @param x           left edge of the region, with the origin at the top-left
     * @param y           top edge of the region, with the origin at the top-left
     * @param width       width of the region
     * @param height      height of the region
     * @param pdfFilePath path of the PDF file to read
     * @return the extracted text, or {@code null} if the file could not be read
     */
    public String parsePdf(final int x, final int y, final int width, final int height,
                           final String pdfFilePath) {
        String text = null;
        PdfReader pdfReader = null;
        try {
            pdfReader = new PdfReader(pdfFilePath);
            final int pageNumber = pdfViewer.getCurrentPageNumber() + 1;
            System.out.println("page number: " + pageNumber);

            // PDF user space has its origin at the bottom-left with y growing
            // upwards, while Swing's origin is at the top-left with y growing
            // downwards, so the y range is mirrored against the page height.
            // iText's Rectangle also takes two corners (llx, lly, urx, ury)
            // rather than a width and a height.
            // NOTE(review): this still assumes the selection is expressed in page
            // units at zoom 1 with no scroll offset or viewer margins - confirm,
            // or map the glass pane coordinates to page coordinates first.
            final float pageHeight = pdfReader.getPageSize(pageNumber).getHeight();
            final Rectangle selection =
                    new Rectangle(x, pageHeight - (y + height), x + width, pageHeight - y);

            final RenderFilter renderFilter = new RegionTextRenderFilter(selection);
            final LocationTextExtractionStrategy delegate = new LocationTextExtractionStrategy();
            final TextExtractionStrategy extractionStrategy =
                    new FilteredTextRenderListener(delegate, renderFilter);
            text = PdfTextExtractor.getTextFromPage(pdfReader, pageNumber, extractionStrategy);
        } catch (final IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file even when extraction fails.
            if (pdfReader != null) {
                pdfReader.close();
            }
        }
        return text;
    }
}

// File PdfViewer.java
import java.util.Properties;
import java.util.ResourceBundle;
import javax.swing.*;
import org.icepdf.ri.common.*;
import org.icepdf.ri.common.views.DocumentViewController;
import org.icepdf.ri.util.PropertiesManager;

/** A panel that embeds the ICEpdf viewer inside a scroll pane. */
public class PdfViewer extends JPanel {

    private final SwingController controller;

    public PdfViewer() {
        controller = new SwingController();
        controller.setIsEmbeddedComponent(true);

        final String bundleName = PropertiesManager.DEFAULT_MESSAGE_BUNDLE;
        final ResourceBundle messageBundle = ResourceBundle.getBundle(bundleName);
        final Properties systemProperties = System.getProperties();
        final PropertiesManager properties = new PropertiesManager(systemProperties, messageBundle);
        properties.set(PropertiesManager.PROPERTY_DEFAULT_ZOOM_LEVEL, "1");

        final SwingViewBuilder factory = new SwingViewBuilder(controller, properties);
        final DocumentViewController viewController = controller.getDocumentViewController();
        viewController.setAnnotationCallback(new MyAnnotationCallback(viewController));

        final JScrollPane scrollPane = new JScrollPane(factory.buildViewerPanel());
        final int horizontalPolicy = ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS;
        final int verticalPolicy = ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS;
        scrollPane.setHorizontalScrollBarPolicy(horizontalPolicy);
        scrollPane.setVerticalScrollBarPolicy(verticalPolicy);
        add(scrollPane);
    }

    /** Opens the PDF at the given path in the viewer. */
    public void openDocument(final String filePath) {
        controller.openDocument(filePath);
    }

    /** Returns the page number reported by the controller; callers add 1 for iText. */
    public int getCurrentPageNumber() {
        return controller.getCurrentPageNumber();
    }
}

// File CustomGlassPane.java
import java.awt.*;
import javax.swing.JComponent;

/** A transparent overlay that draws the selection rectangle on top of the viewer. */
public class CustomGlassPane extends JComponent {

    private Point topLeftPoint;
    private Point bottomRightPoint;

    public CustomGlassPane(final ExtractSelectionFromPdf extractSelectionFromPdf,
                           final Container contentPane) {
        final MouseEventsListener listener =
                new MouseEventsListener(extractSelectionFromPdf, this, contentPane);
        addMouseListener(listener);
        addMouseMotionListener(listener);
    }

    /** Sets the corners of the rectangle to draw; pass nulls to clear it. */
    public void setSelection(final Point topLeftPoint, final Point bottomRightPoint) {
        this.topLeftPoint = topLeftPoint;
        this.bottomRightPoint = bottomRightPoint;
    }

    @Override
    protected void paintComponent(final Graphics graphics) {
        if (topLeftPoint != null && bottomRightPoint != null) {
            graphics.setColor(Color.BLACK);
            // drawRect draws nothing for a negative width or height, so normalise
            // the corners to keep the rectangle visible when dragging up or left.
            final int x = Math.min(topLeftPoint.x, bottomRightPoint.x);
            final int y = Math.min(topLeftPoint.y, bottomRightPoint.y);
            final int width = Math.abs(bottomRightPoint.x - topLeftPoint.x);
            final int height = Math.abs(bottomRightPoint.y - topLeftPoint.y);
            graphics.drawRect(x, y, width, height);
        }
    }
}

// File MouseEventsListener.java
import java.awt.*;
import java.awt.event.MouseEvent;
import javax.swing.SwingUtilities;
import javax.swing.event.MouseInputAdapter;

/**
 * Tracks the drag selection on the glass pane and forwards every mouse event
 * to the component underneath.
 */
public class MouseEventsListener extends MouseInputAdapter {

    private ExtractSelectionFromPdf extractSelectionFromPdf;
    private CustomGlassPane customGlassPane;
    private Container contentPane;
    private Point topLeftPoint;
    private Point bottomRightPoint;

    public MouseEventsListener(final ExtractSelectionFromPdf extractSelectionFromPdf,
                               final CustomGlassPane customGlassPane,
                               final Container contentPane) {
        this.extractSelectionFromPdf = extractSelectionFromPdf;
        this.customGlassPane = customGlassPane;
        this.contentPane = contentPane;
    }

    @Override
    public void mousePressed(final MouseEvent mouseEvent) {
        topLeftPoint = mouseEvent.getPoint();
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseDragged(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();
        // Only repaint while a selection is actually in progress.
        redispatchMouseEvent(mouseEvent, topLeftPoint != null, false);
    }

    @Override
    public void mouseReleased(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();
        redispatchMouseEvent(mouseEvent, true, true);
    }

    @Override
    public void mouseMoved(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseClicked(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseEntered(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseExited(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    /** Forwards the event without repainting or extracting. */
    private void redispatchMouseEvent(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent, false, false);
    }

    /**
     * Forwards the event to the deepest component under the glass pane, then
     * optionally extracts the selected text and repaints the glass pane.
     *
     * @param mouseEvent the event received by the glass pane
     * @param repaint    whether to update and repaint the glass pane
     * @param extract    whether to extract the text and clear the selection first
     */
    private void redispatchMouseEvent(final MouseEvent mouseEvent, final boolean repaint,
                                      final boolean extract) {
        final Point glassPanePoint = mouseEvent.getPoint();
        final Point containerPoint =
                SwingUtilities.convertPoint(customGlassPane, glassPanePoint, contentPane);

        if (containerPoint.y >= 0) {
            final Component component = SwingUtilities.getDeepestComponentAt(
                    contentPane, containerPoint.x, containerPoint.y);
            if (component != null) {
                final Point componentPoint =
                        SwingUtilities.convertPoint(customGlassPane, glassPanePoint, component);
                // Forward events to the component under the glass pane.
                component.dispatchEvent(new MouseEvent(component,
                        mouseEvent.getID(),
                        mouseEvent.getWhen(),
                        mouseEvent.getModifiers(),
                        componentPoint.x,
                        componentPoint.y,
                        mouseEvent.getClickCount(),
                        mouseEvent.isPopupTrigger()));
            }
        }

        // Update the glass pane if requested.
        if (repaint) {
            if (extract) {
                extractSelectionFromPdf.handleSelection(topLeftPoint, bottomRightPoint);
                // The selection is finished: clear it so the rectangle disappears.
                topLeftPoint = null;
                bottomRightPoint = null;
            }
            customGlassPane.setSelection(topLeftPoint, bottomRightPoint);
            customGlassPane.repaint();
        }
    }
}
The glass pane part of the code above was inspired by the GlassPaneDemo example.
Known remaining issues in the code above:
- For some reason, the scroll-down button of the PDF viewer has to be clicked once before Page Up/Down and the arrow up/down keys work.
- Currently the extracted text seems to be below the selected rectangle.
Comments
Post a Comment