How to extract text from PDF files in Salesforce

Published on May 5, 2017

While there are many formats to store & retrieve your data digitally, the Portable Document Format (pdf) stands out amidst them thanks to its various features like Security, Reduced size, Password protection & Compatibility.

Because the PDF format offers more of these qualities than other formats, people prefer to store most of their files as PDFs. In Salesforce, however, developers can extract text only from files or attachments in ‘.txt’ format.

To extract data from PDF files/attachments, one must override Salesforce’s standard file upload functionality using JavaScript and a Visualforce page. This process requires downloading the pdf.js zip file listed under Prerequisites.

Extract Text From PDF Files In Salesforce-1

Prerequisites:

Download the pdf.js zip file from https://github.com/xavikarthi/SF-repository

The process is as follows:

Create a static resource named “pdf_upload” from the downloaded js file
Add a loading image as a static resource named “loader” /// to inform the user about the upload process
Create a Custom_Field__c in CustomObject__c to save the text content
Create a Visualforce Page with below code

<apex:page standardController="CustomObject__c">

<apex:form enctype="multipart/form-data">

<style>

/* The page fills the viewport and does not scroll vertically itself. */
html, body { width: 100%; height: 100%; overflow-y: hidden; padding: 0; margin: 0; }

body { font: 13px Helvetica,sans-serif; }

/* Direct child divs of body take the full size and scroll vertically on their own. */
body > div { width: 100%; height: 100%; overflow-y: auto; display: inline-block; vertical-align: top; }

iframe { border: none; width: 100%; height: 100%; }

#output { padding: 10px; box-shadow: 0 0 5px #777; border-radius: 5px; margin: 10px; }

/* Hidden by default. Presumably the element the script addresses as `processor`
   (it posts the PDF bytes to processor.contentWindow) - confirm against the markup. */
#processor { height: 70px; display:none; }

#input { display:none;}

/* Hidden by default; the script sets display to 'block' while the file is being read. */
#inprogress { display:none; float:left; padding-left:25px; padding-top:0px; }

.inputfield { float:left; }

</style>

<body>

<!-- embed the pdftotext web app as an iframe -->

</div>

</body>

<script>

/// Lower-cased extension of the most recently checked file; written by
/// checkFile() and read by fileExtension().
var sFileExtension;

/// Validate every new selection made through the "file-input" element.
document.getElementById('file-input').addEventListener('change', checkFile, false);

/// Change handler for the file input: validates the selected file(s).
///
/// Stores the lower-cased extension of each checked file in the shared
/// `sFileExtension` variable. A file that is not a PDF, or that is larger
/// than 1 MB, is rejected: the user is alerted, the input's selection is
/// cleared and `sFileExtension` is reset so later code does not treat the
/// rejected file as a valid PDF.
///
/// e - the "change" event; `e.target` is the file input element.
function checkFile(e) {
    var MAX_FILE_SIZE = 1048576; /// 1 MB, in bytes

    /// get list of files
    var file_list = e.target.files;

    /// go through the list of files
    for (var i = 0; i < file_list.length; i++) {
        var file = file_list[i];
        var sFileName = file.name;

        /// text after the last dot (the whole name when it contains no dot)
        sFileExtension = sFileName.split('.').pop().toLowerCase();

        var iFileSize = file.size;
        var iConvert = (iFileSize / MAX_FILE_SIZE).toFixed(2);

        if (sFileExtension !== "pdf" || iFileSize > MAX_FILE_SIZE) {
            var txt = "File type : " + sFileExtension + "\n\n";
            txt += "Size: " + iConvert + " MB \n\n";
            txt += "Please select pdf with less than 1 MB.\n\n";
            alert(txt);

            /// Actually discard the rejected selection. Reassigning the loop
            /// variable would leave the file selected in the input.
            e.target.value = "";
            sFileExtension = undefined;
            return;
        }
    }
}

/// Filled in from the GETSESSIONID() merge field when the page is rendered.
/// NOTE(review): presumably read by the AJAX Toolkit (sforce.connection) - confirm.
var __sfdcSessionId = '{!GETSESSIONID()}';

/// Not referenced by any code visible in this page section.
var ContentVersion;

/// Starts the extract-and-upload flow via onReady() when the last checked
/// file has the "pdf" extension; otherwise alerts the user and does nothing.
function fileExtension() {
    if (sFileExtension !== "pdf") {
        alert('Please select pdf document ');
        return;
    }

    onReady();
}

/// Wires up the reply handler and then kicks off processing.
///
/// Registers a "message" listener that passes the received text, with every
/// run of whitespace collapsed to a single space, to updateCandidate(). It
/// then calls readpdf() followed by uploadFile().
function onReady() {
    window.addEventListener("message", function (event) {
        /// Only accept string replies coming from the `processor` frame that
        /// readpdf() posts to; other windows may post messages to this page
        /// too, and non-string data has no replace().
        if (event.source !== processor.contentWindow || typeof event.data !== "string") {
            return;
        }

        updateCandidate(event.data.replace(/\s+/g, " "));
    });

    readpdf();

    uploadFile();
}

/// Reads the first file selected in the "file-input" element as an
/// ArrayBuffer and posts the result to the `processor` frame's window.
/// While the read is in progress the "inprogress" element is made visible.
/// Does nothing when no file is selected.
function readpdf() {
    var selectedFile = document.getElementById("file-input").files[0];

    if (!selectedFile) {
        /// Nothing to read; FileReader would throw on an undefined input.
        return;
    }

    var reader = new FileReader();

    reader.onload = function () {
        processor.contentWindow.postMessage(reader.result, "*");
    };

    reader.onprogress = function () {
        document.getElementById("inprogress").style.display = 'block';
    };

    reader.readAsArrayBuffer(selectedFile);
}

/// Creates one Salesforce ContentVersion per file selected in "file-input".
///
/// Each file is read as a binary string, wrapped with sforce.Base64Binary
/// and created through sforce.connection.create(). The user is alerted when
/// a create call reports failure.
///
/// NOTE(review): nothing set here associates the new ContentVersion with the
/// CustomObject__c record - confirm whether that link is made elsewhere.
function uploadFile() {
    var input = document.getElementById('file-input');

    var filesToUpload = input.files;

    for (var i = 0; i < filesToUpload.length; i++) {
        var f = filesToUpload[i];

        var reader = new FileReader();

        // Keep a reference to the File in the FileReader so it can be accessed in callbacks
        reader.file = f;

        /// creating salesforce file
        reader.onload = function (e) {
            var NewContentVersion = new sforce.SObject("ContentVersion");

            /// JSENCODE keeps quotes or backslashes in the record name from
            /// breaking out of this string literal when the page is rendered.
            NewContentVersion.Title = '{!JSENCODE(CustomObject__c.Name)}' + '`s-Profile';

            NewContentVersion.PathOnClient = '/' + this.file.name;

            NewContentVersion.VersionData = (new sforce.Base64Binary(e.target.result)).toString();

            NewContentVersion.Origin = 'H';

            var result = sforce.connection.create([NewContentVersion]);

            /// Surface a failed create instead of silently ignoring it.
            if (!result[0].getBoolean("success")) {
                alert('Document is Not Uploaded');
            }
        };

        reader.readAsBinaryString(f);
    }
}

///Updating custom object field with the text

/// Saves the extracted text on the current CustomObject__c record.
///
/// Writes `profileContent` to Custom_Field__c through sforce.connection.update().
/// On success it logs, alerts the user and navigates the top window to the
/// record; on failure it alerts the user.
///
/// profileContent - the text to store in Custom_Field__c.
function updateCandidate(profileContent) {
    var record = new sforce.SObject("CustomObject__c");

    /// NOTE(review): the property is spelled `ID` here - confirm the toolkit
    /// accepts this spelling for the record id.
    record.ID = '{!CustomObject__c.Id}';

    record.Custom_Field__c = profileContent;

    var result = sforce.connection.update([record]);

    if (result[0].getBoolean("success")) {
        console.log("CustomObject with id " + result[0].id + " updated");

        alert('New Document is Uploaded Successfully');

        window.top.location = '/{!CustomObject__c.id}';
    } else {
        alert('Document is Not Uploaded');
    }
}

</script>

</apex:form>

</apex:page>

Add the Visualforce page to a section of the standard page layout, then upload the file against the record.

After adding the custom Visualforce page, your page layout will look like the image below:

Limitations:

This works only for PDF files/attachments.
The upload speed depends on the file size.
It supports only single file upload.