PDF Extract in C#, VB.NET

  • Demo
  • C# source
  • VB.Net source

This sample demonstrates how to extract images and text from a PDF document.

(NO screenshot)

using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Text;
using Spire.Pdf;

namespace Extraction
{
    /// <summary>
    /// Sample program: pulls the text and the images out of every page of
    /// "Sample2.pdf", writes the text to "TextInPdf.txt", writes each image to
    /// "Image-N.png", and finally opens the text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            String fileName = "TextInPdf.txt";

            StringBuilder buffer = new StringBuilder();
            IList<Image> images = new List<Image>();

            try
            {
                //Create a pdf document.
                PdfDocument doc = new PdfDocument();
                try
                {
                    doc.LoadFromFile(@"Sample2.pdf");

                    //Collect the text and the images of every page.
                    foreach (PdfPageBase page in doc.Pages)
                    {
                        buffer.Append(page.ExtractText());
                        foreach (Image image in page.ExtractImages())
                        {
                            images.Add(image);
                        }
                    }
                }
                finally
                {
                    //Close the document even when loading or extraction fails.
                    doc.Close();
                }

                //save text
                File.WriteAllText(fileName, buffer.ToString());

                //save image
                int index = 0;
                foreach (Image image in images)
                {
                    String imageFileName
                        = String.Format("Image-{0}.png", index++);
                    image.Save(imageFileName, ImageFormat.Png);
                }
            }
            finally
            {
                //System.Drawing.Image is IDisposable; dispose every extracted
                //image, including when saving fails part-way through.
                foreach (Image image in images)
                {
                    image.Dispose();
                }
            }

            //Launching the Text file.
            //Opening a document (rather than an executable) needs shell
            //execution; request it explicitly because it is not the default
            //on every .NET runtime.
            System.Diagnostics.ProcessStartInfo startInfo
                = new System.Diagnostics.ProcessStartInfo(fileName);
            startInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(startInfo);
        }

    }
}

Imports System.Collections.Generic
Imports System.Drawing
Imports System.Drawing.Imaging
Imports System.IO
Imports System.Text
Imports Spire.Pdf

Namespace Extraction
    ''' <summary>
    ''' Sample program: pulls the text and the images out of every page of
    ''' "Sample2.pdf", writes the text to "TextInPdf.txt", writes each image to
    ''' "Image-N.png", and finally opens the text file.
    ''' </summary>
    Friend Class Program
        ''' <summary>
        ''' Program entry point.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            Dim fileName As String = "TextInPdf.txt"

            Dim buffer As New StringBuilder()
            Dim images As IList(Of Image) = New List(Of Image)()

            Try
                'Create a pdf document.
                Dim doc As New PdfDocument()
                Try
                    doc.LoadFromFile("Sample2.pdf")

                    'Collect the text and the images of every page.
                    For Each page As PdfPageBase In doc.Pages
                        buffer.Append(page.ExtractText())
                        For Each image As Image In page.ExtractImages()
                            images.Add(image)
                        Next image
                    Next page
                Finally
                    'Close the document even when loading or extraction fails.
                    doc.Close()
                End Try

                'save text
                File.WriteAllText(fileName, buffer.ToString())

                'save image
                Dim index As Integer = 0
                For Each image As Image In images
                    Dim imageFileName As String = String.Format("Image-{0}.png", index)
                    index += 1
                    image.Save(imageFileName, ImageFormat.Png)
                Next image
            Finally
                'System.Drawing.Image is IDisposable; dispose every extracted
                'image, including when saving fails part-way through.
                For Each image As Image In images
                    image.Dispose()
                Next image
            End Try

            'Launching the Text file.
            'Opening a document (rather than an executable) needs shell
            'execution; request it explicitly because it is not the default
            'on every .NET runtime.
            Dim startInfo As New System.Diagnostics.ProcessStartInfo(fileName)
            startInfo.UseShellExecute = True
            System.Diagnostics.Process.Start(startInfo)
        End Sub

    End Class
End Namespace