Extract text from pdf file using javascript
here is a nice example of how to use pdf.js for extracting the text:
http://git.macropus.org/2011/11/pdftotext/example/
of course you have to remove a lot of code for your purpose, but it should do it
How to extract text from a PDF in JavaScript
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
- What you get in each step is a promise. You need to code this way:
.then( function(){...})
to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an array of text blocks (each with a str field holding its text)
textContent.bidiTexts[]
. You concatenate them to get the text of one page. The text blocks' coordinates are used to decide whether a newline or a space needs to be inserted. (This may not be totally robust, but in my tests it seems OK.) The input parameter
data
needs to be either a URL or ArrayBuffer data. I used the readAsArrayBuffer(file) function of the FileReader
API to get the data.
Note: According to another user, the library has since been updated, which broke this code. According to the comment by async5 below, you need to replace textContent.bidiTexts
with textContent.items
.
/**
 * Extracts the plain text of a PDF through the global PDFJS object (pdf.js).
 *
 * Usage: `new Pdf2TextClass().pdfToText(data, onPageDone, onAllDone)`.
 * `this.complete` holds the number of pages finished by the most recent
 * `pdfToText` call.
 */
function Pdf2TextClass() {
  var self = this;
  this.complete = 0;

  // Decides whether a block ends in a lone letter (alone, or after whitespace);
  // in that case no space is inserted before the next block.
  const TRAILING_SINGLE_LETTER = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

  /**
   * Reads a block coordinate. Legacy `bidiTexts` blocks carry `x`/`y` directly;
   * newer `items` carry them as the translation part of `transform`
   * (indices 4 and 5) - NOTE(review): confirm against the pdf.js version in use.
   */
  function coordinate(block, axis) {
    if (typeof block[axis] === 'number') {
      return block[axis];
    }
    if (block.transform) {
      return block.transform[axis === 'x' ? 4 : 5];
    }
    return undefined;
  }

  /** Concatenates the text blocks of one page, inserting line breaks/spaces by position. */
  function joinTextBlocks(blocks) {
    let pageText = "";
    let lastBlock = null;
    for (let k = 0; k < blocks.length; k++) {
      const block = blocks[k];
      if (lastBlock !== null && lastBlock.str[lastBlock.str.length - 1] !== ' ') {
        if (coordinate(block, 'x') < coordinate(lastBlock, 'x')) {
          // The block starts further left than the previous one: treat as a new line.
          pageText += "\r\n";
        } else if (coordinate(lastBlock, 'y') !== coordinate(block, 'y') &&
                   lastBlock.str.match(TRAILING_SINGLE_LETTER) === null) {
          pageText += ' ';
        }
      }
      pageText += block.str;
      lastBlock = block;
    }
    return pageText;
  }

  /**
   * @param data ArrayBuffer of the pdf file content, or a URL string
   * @param callbackPageDone To inform the progress each time
   *        a page is finished. The callback function's input parameters are:
   *        1) number of pages done;
   *        2) total number of pages in file.
   * @param callbackAllDone The input parameter of callback function is
   *        the result of extracted text from pdf file.
   * @param callbackError Optional. Receives the error when loading or
   *        extraction fails; without it the error is logged via console.error.
   */
  this.pdfToText = function (data, callbackPageDone, callbackAllDone, callbackError) {
    console.assert(data instanceof ArrayBuffer || typeof data === 'string');

    function reportError(err) {
      if (typeof callbackError === 'function') {
        callbackError(err);
      } else {
        console.error(err);
      }
    }

    // Reset the progress counter so the same instance can be reused.
    self.complete = 0;
    // Counted per call, so overlapping calls cannot corrupt each other's progress.
    let done = 0;

    PDFJS.getDocument(data).then(function (pdf) {
      const total = pdf.numPages;
      const layers = {};
      callbackPageDone(0, total);

      if (total === 0) {
        callbackAllDone("");
        return;
      }

      // `let` gives every iteration its own binding (the original leaked a global `i`).
      for (let i = 1; i <= total; i++) {
        pdf.getPage(i).then(function (page) {
          const n = page.pageNumber;
          return page.getTextContent().then(function (textContent) {
            // Older pdf.js exposes `bidiTexts`, newer versions expose `items`.
            const blocks = textContent && (textContent.bidiTexts || textContent.items);
            if (blocks != null) {
              layers[n] = joinTextBlocks(blocks) + "\n\n";
              console.log("page " + n + " finished.");
            }

            done += 1;
            self.complete = done;
            callbackPageDone(done, total);

            if (done === total) {
              // Pages may finish out of order; assemble by page number and
              // skip pages that produced no text instead of appending "undefined".
              let fullText = "";
              for (let j = 1; j <= total; j++) {
                fullText += layers[j] || "";
              }
              callbackAllDone(fullText);
            }
          });
        }).catch(reportError);
      }
    }).catch(reportError);
  }; // end of pdfToText()
} // end of class
How to correctly extract text from a pdf using pdf.js
Similar to https://stackoverflow.com/a/40494019/1765767 -- collect page promises using Promise.all and don't forget to chain then's:
/**
 * Loads a PDF through the global pdfjsLib and resolves with the text of all
 * pages concatenated in page order.
 *
 * @param pdfUrl Source handed unchanged to pdfjsLib.getDocument().
 * @returns {Promise<string>} The `str` of every text item of every page, joined
 *          without separators. Rejects if loading or any page fails.
 */
function gettext(pdfUrl) {
  const loadingTask = pdfjsLib.getDocument(pdfUrl);
  // Newer pdf.js returns a loading task exposing `.promise`; older versions
  // returned a thenable. Promise.resolve() normalizes both to a real Promise.
  const documentPromise = Promise.resolve(loadingTask.promise || loadingTask);

  return documentPromise.then(function (pdf) {
    // Older versions kept the page count in `pdf.pdfInfo`; newer ones on `pdf`.
    const maxPages = pdf.pdfInfo ? pdf.pdfInfo.numPages : pdf.numPages;
    const pagePromises = []; // one text promise per page, in page order

    for (let j = 1; j <= maxPages; j++) {
      pagePromises.push(
        pdf.getPage(j)
          .then(function (page) {
            return page.getTextContent();
          })
          .then(function (text) {
            return text.items.map(function (s) { return s.str; }).join('');
          })
      );
    }

    // Promise.all keeps the input order, so pages are joined in page order.
    return Promise.all(pagePromises).then(function (texts) {
      return texts.join('');
    });
  });
}
// Start the extraction for the sample document; show the text on success,
// log the rejection reason on failure.
gettext("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf").then(
  function onTextReady(extractedText) {
    alert('parse ' + extractedText);
  },
  function onFailure(failureReason) {
    console.error(failureReason);
  }
);
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
How to extract text from a pdf file in javascript?
PDF.js allows you to load file objects and then parse the document as a text. This example from the official website does exactly that.
How to extract text from PDF?
I'm answering my own question. First I create a regular html input.
<input type='file'/>
I'm using React, so I use onChange
attribute in place of id
.
So, when the user selects a file, a handler function is called, and I use the following code to get the file:
const file = event.target.files[0];
file
does not have a path, which PDF.js would normally use to load the real file.
Then I use a FileReader
to convert the file into an ArrayBuffer (its raw bytes):
const fileReader = new FileReader();
Then we set a function at fileReader.onload
the function can be found here
fileReader.onload = function() {...}
Finally we do this:
fileReader.readAsArrayBuffer(file);
Important PS: pdf.pdfInfo
must be replaced with pdf
in newer PDF.js versions.
Thanks for helping.
Extra PS: To use pdfjsLib
as PDFJS in React I did this in index.html
file:
window.PDFJS = pdfjsLib
Extract specific portion of text from pdf using Javascript?
You could match Datacover between word boundaries \b
and then repeat three times, non-greedily, a match of any character including a newline [\s\S]*?
until the next occurrence of a dot and space \.
\bDatacover\b(?:[\s\S]*?\. ){3}
Regex demo
To get the data, you could use
event.data.match(regex)
For example:
// Sample payload containing the keyword followed by several sentences.
let event = {
  data: `testhjgjhg hjg jhg jkgh kjhghjkg76t 76 tguygtf yr 6 rt6 gtyut 67 tuy yoty yutyu tyu yutyuit iyut iuytiyu tuiyt Datacover uytuy tuyt uyt uiytuiyt uytutest.yu tuyt uyt uyt iutiuyt uiy yuitui tuyttest. uiyt uiytuiyt uyt ut uithis is a test. sjhdgfjsa. hgwryuehrgfhrghw fsdfdfsfs sddsfdfs.`
};

// "Datacover" as a whole word, then three lazily matched runs that each end
// with a dot followed by a space.
const regex = /\bDatacover\b(?:[\s\S]*?\. ){3}/g;

const datacoverMatches = event.data.match(regex);
console.log(datacoverMatches);
Related Topics
Sort an Array of Object by a Property (With Custom Order, Not Alphabetically)
Shiny App:Disable Downloadbutton
Browser-Independent Way to Detect When Image Has Been Loaded
The .Replace() Method Does Change the String in Place
Understanding Service Worker Scope
Reading JavaScript Variable into Shiny/R on App Load
Are There JavaScript or Ruby Versions of "HTML Tidy"
How Would You Overload the [] Operator in JavaScript
Firebase.Database Is Not a Function
How to Save a Leaflet Map with Drawn Shapes/Points on It in Shiny
How to Include HTML in a Js Rails Response
Ruby on Rails 3.1 - Assets Pipeline - Assets Rendered Twice
How Persistent Is Localstorage
Change Second Select List Based on First Select List Value in Rails
Difference Between 'Let' and 'Const' Ecmascript 2015 (Es6)