Alright, here is an update. The experimental text extraction has been on the whole time, and I turned on Force.OCR, which has generated some errors. The most notable is the first error it throws after the "No text to extract" message, which has to do with a zip problem (see the server.log output below).
Code: Select all
2011-09-27 17:39:36,472 WARN [com.openkm.extractor.CuneiformTextExtractor] IO exception executing command: /usr/local/bin/tesseract
java.util.zip.ZipException: error in opening zip file
at java.util.zip.ZipFile.open(Native Method)
at java.util.zip.ZipFile.<init>(ZipFile.java:127)
at java.util.zip.ZipFile.<init>(ZipFile.java:88)
Now, this is strange, as it indicates that perhaps tesseract wasn't built correctly with all the appropriate libraries. So I double-checked that I had all the libraries (including zlib) and compiled it again. The tesseract README says to double-check the config_auto.h file for a line that says something like #HAVE_ZLIB, which isn't there. There is a line that says #HAVE_LIBZ, which I think might be an error in the coding. After much searching, I narrowed it down to a problem in leptonica: during compilation it doesn't recognise that zlib is installed, so it doesn't include it in the build, which in turn means that tesseract can't use it, hence the error above.
Unfortunately, this problem won't be fixed until the next release of leptonica, although there is a patch. I've applied the patch and rebuilt leptonica, then tesseract, and I still get an error. It is copied from the server.log as follows:
Code: Select all
2011-09-28 13:56:48,796 WARN [com.openkm.extractor.PdfTextExtractor] PDF does not contains text layer
2011-09-28 13:56:48,811 WARN [com.openkm.util.ExecutionUtils] Abnormal program termination: 1
2011-09-28 13:56:48,811 WARN [com.openkm.util.ExecutionUtils] STDERR: Usage:/usr/local/bin/tesseract imagename outputbase [-l lang] [configfile [[+|-]varfile]...]
2011-09-28 13:56:48,811 WARN [com.openkm.extractor.CuneiformTextExtractor] IO exception executing command: /usr/local/bin/tesseract
java.util.zip.ZipException: error in opening zip file
at java.util.zip.ZipFile.open(Native Method)
at java.util.zip.ZipFile.<init>(ZipFile.java:127)
at java.util.zip.ZipFile.<init>(ZipFile.java:88)
at com.openkm.util.DocumentUtils.spellChecker(DocumentUtils.java:177)
at com.openkm.extractor.CuneiformTextExtractor.doOcr(CuneiformTextExtractor.java:130)
at com.openkm.extractor.PdfTextExtractor.extractText(PdfTextExtractor.java:92)
at org.apache.jackrabbit.extractor.CompositeTextExtractor.extractText(CompositeTextExtractor.java:90)
at org.apache.jackrabbit.core.query.lucene.JackrabbitTextExtractor.extractText(JackrabbitTextExtractor.java:195)
at com.openkm.extractor.RegisteredExtractors.getText(RegisteredExtractors.java:75)
at com.openkm.extractor.RegisteredExtractors.index(RegisteredExtractors.java:117)
at com.openkm.module.base.BaseDocumentModule.create(BaseDocumentModule.java:161)
at com.openkm.module.direct.DirectDocumentModule.create(DirectDocumentModule.java:199)
at com.openkm.module.direct.DirectDocumentModule.create(DirectDocumentModule.java:98)
at com.openkm.api.OKMDocument.create(OKMDocument.java:71)
at com.openkm.servlet.frontend.FileUploadServlet.doPost(FileUploadServlet.java:176)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:710)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:803)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.jboss.web.tomcat.filters.ReplyHeaderFilter.doFilter(ReplyHeaderFilter.java:96)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:230)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:175)
at org.jboss.web.tomcat.security.SecurityAssociationValve.invoke(SecurityAssociationValve.java:182)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:524)
at org.jboss.web.tomcat.security.JaccContextValve.invoke(JaccContextValve.java:84)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.jboss.web.tomcat.service.jca.CachedConnectionValve.invoke(CachedConnectionValve.java:157)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:262)
at org.apache.coyote.http11.Http11AprProcessor.process(Http11AprProcessor.java:856)
at org.apache.coyote.http11.Http11AprProtocol$Http11ConnectionHandler.process(Http11AprProtocol.java:566)
at org.apache.tomcat.util.net.AprEndpoint$Worker.run(AprEndpoint.java:1508)
at java.lang.Thread.run(Thread.java:662)
2011-09-28 13:56:48,813 ERROR [org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap] java.lang.IllegalArgumentException: Number of bits must be >= 0
java.lang.IllegalArgumentException: Number of bits must be >= 0
at java.awt.image.ColorModel.<init>(ColorModel.java:353)
at java.awt.image.ComponentColorModel.<init>(ComponentColorModel.java:256)
at org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray.createColorModel(PDDeviceGray.java:91)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap.getRGBImage(PDPixelMap.java:238)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap.write2OutputStream(PDPixelMap.java:285)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage.write2file(PDXObjectImage.java:165)
at com.openkm.extractor.PdfTextExtractor.extractText(PdfTextExtractor.java:91)
at org.apache.jackrabbit.extractor.CompositeTextExtractor.extractText(CompositeTextExtractor.java:90)
at org.apache.jackrabbit.core.query.lucene.JackrabbitTextExtractor.extractText(JackrabbitTextExtractor.java:195)
at com.openkm.extractor.RegisteredExtractors.getText(RegisteredExtractors.java:75)
at com.openkm.extractor.RegisteredExtractors.index(RegisteredExtractors.java:117)
at com.openkm.module.base.BaseDocumentModule.create(BaseDocumentModule.java:161)
at com.openkm.module.direct.DirectDocumentModule.create(DirectDocumentModule.java:199)
at com.openkm.module.direct.DirectDocumentModule.create(DirectDocumentModule.java:98)
at com.openkm.api.OKMDocument.create(OKMDocument.java:71)
at com.openkm.servlet.frontend.FileUploadServlet.doPost(FileUploadServlet.java:176)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:710)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:803)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.jboss.web.tomcat.filters.ReplyHeaderFilter.doFilter(ReplyHeaderFilter.java:96)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:230)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:175)
at org.jboss.web.tomcat.security.SecurityAssociationValve.invoke(SecurityAssociationValve.java:182)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:524)
at org.jboss.web.tomcat.security.JaccContextValve.invoke(JaccContextValve.java:84)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.jboss.web.tomcat.service.jca.CachedConnectionValve.invoke(CachedConnectionValve.java:157)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:262)
at org.apache.coyote.http11.Http11AprProcessor.process(Http11AprProcessor.java:856)
at org.apache.coyote.http11.Http11AprProtocol$Http11ConnectionHandler.process(Http11AprProtocol.java:566)
at org.apache.tomcat.util.net.AprEndpoint$Worker.run(AprEndpoint.java:1508)
at java.lang.Thread.run(Thread.java:662)