How to create a simple PDF to Word converter using JavaScript

PasteNotes January 05, 2023

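Here is an example of how you could extract the text from a PDF file in the browser using JavaScript. It relies on the PDF.js library, which must be loaded on the page first so that the global PDFJS object used below is available: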


// Download a PDF with XMLHttpRequest, open it with PDF.js, and log the text of
// its first page to the console.
// Needs a browser environment and the PDF.js library exposing the global PDFJS.
var xhr = new XMLHttpRequest();
xhr.open('GET', '/path/to/pdf.pdf');
// Ask for raw bytes: the response is wrapped in a Uint8Array before parsing.
xhr.responseType = 'arraybuffer';

// Network-level failures never reach onload, so report them here.
xhr.onerror = function() {
  console.error('Failed to load PDF: network error');
};

xhr.onload = function(e) {
  // The load event also fires for HTTP error responses (404, 500, ...), whose
  // body is not a PDF. Status 0 is let through because some browsers report it
  // for non-HTTP schemes such as file://.
  var status = this.status;
  if (status !== 0 && (status < 200 || status >= 300)) {
    console.error('Failed to load PDF: HTTP ' + status);
    return;
  }

  // Parse the PDF file into a PDFJS Document object
  var data = new Uint8Array(this.response);
  var pdf = PDFJS.getDocument(data);

  // One flat promise chain so that the single catch at the end sees a failure
  // from any step (parsing, page lookup, text extraction).
  pdf.then(function(pdf) {
    return pdf.getPage(1);
  }).then(function(page) {
    // Render the page into an off-screen canvas. The canvas is never attached
    // to the document, and the render result is not awaited.
    var scale = 1.5;
    var viewport = page.getViewport(scale);
    var canvas = document.createElement('canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;
    page.render({ canvasContext: context, viewport: viewport });

    // The text is read from the page itself, not from the canvas.
    return page.getTextContent();
  }).then(function(text) {
    // Concatenate the string values of the text items; every item is followed
    // by a single space, so the result ends with a trailing space.
    var finalString = text.items.map(function(item) {
      return item.str + ' ';
    }).join('');

    console.log(finalString);
  }).catch(function(err) {
    console.error('Failed to extract text from PDF:', err);
  });
};

xhr.send();

This code loads a PDF file using an XMLHttpRequest, parses it into a PDFJS Document object, and renders the first page of the PDF into a canvas. It then extracts the text of that page by calling the page's getTextContent method; the text is read from the PDF page itself, not from the canvas, so the rendering step is not required for text extraction. The extracted text is returned as an array of text items, which are concatenated into a single string and logged to the console.

I hope this helps! Let me know if you have any questions.

Post a Comment

0 Comments