Document conversion workflow

Conversion of multipage documents

If you want to take advantage of multithreading, you can use the interface IPageQueue or its implementation CRangedPageQueue to start the multipage document conversion right at the beginning of the OCR processing.

DCW2
Image 1. Document conversion workflow using CRangedPageQueue

This way, the processed pages can be used as soon as the output document engine has retrieved them.

Alternatively, you can use the experimental CDocumentProcessing class, which handles the multithreading for you.

Code snippets

To keep them simple, the code snippets below do not show exception handling.

Handle threads manually

CApplication within a multithreaded document conversion workflow, using the C++ API and std::thread instructions
/**
 * CreateDocument() processes one or more image(s) to create a document.
 * The pages of the document are stored using the default page collection.
 *
 * The pages are read by iProcessingThreadCount worker threads, while the
 * output document is written from a separate thread. All threads are joined
 * before the function returns. Exception handling is omitted for brevity.
 */
void CApplication::CreateDocument ()
{
  CDocumentWriter objDocumentWriter = CDocumentWriter::Create ( m_objCIDRS );
  CDefaultPageCollection objPageCollection = CDefaultPageCollection::Create ( m_objCIDRS );
  ...
  // Create the document on top of the page collection
  CDocument objDocument = CDocument::Create ( objPageCollection );
  ...
  CImageIO objImageIO = CImageIO::Create ( m_objCIDRS );
  // Get number of pages
  const IDRS_UINT uiPageCount = objImageIO.GetPageCount ( strFilePath );
  const int iProcessingThreadCount = 3;
  std::vector<std::thread> xThreads;
  ...
  // Start OCR in 3 other threads.
  // The loop counter declared here is the one that must be incremented.
  for ( int iProcessingThreadIndex = 0; iProcessingThreadIndex < iProcessingThreadCount; iProcessingThreadIndex++ )
  {
    xThreads.push_back ( std::thread ([...]()
      {
        for (...)
        {
          // Read from the input file the page at current index and append it to the document
          ReadPage ( objImageIO, objDocument, strFilePath, uiPageIndex, xThreads );
          ...
        }
      }));
    ...
  }
  ...

  // Create the output document in a separate thread
  std::thread outputThread([&]()
    {
      objDocumentWriter.SetOutputParams ( m_objOutputParams );
      objDocumentWriter.Save ( strOutputFilePath, objDocument );
    });

  // Wait for the output thread, then for every processing thread, before leaving the function:
  // a std::thread must be joined (or detached) before it is destroyed.
  outputThread.join ();
  for ( auto& objThread : xThreads )
  {
    objThread.join ();
  }
}
CDocument within a multithreaded document conversion workflow, using the .NET API
// Multithreaded conversion: one thread writes the output document while
// up to 3 parallel workers load and process the pages.
CIDRS objIdrs = new CIDRS();
// Create the document object and set the blocking mode flag to 'true'.
CDocument objDocument = new CDocument ( objPageCollection, true );

// Start document output in a separate thread
Thread objOutputThread = new Thread(() =>
{
  CDocumentWriter objDocumentWriter = new CDocumentWriter ( objIdrs );
  objDocumentWriter.OutputParams = objOutputParams;
  objDocumentWriter.Save ( strOutputPath, objDocument );
});
objOutputThread.Start();

// Start OCR in 3 other threads
const int iProcessingThreadCount = 3;
Parallel.For(...,
  new ParallelOptions {MaxDegreeOfParallelism = iProcessingThreadCount},
  ... =>
{

  // Retrieve the number of the next page to process
  UInt32 uiPageToProcess = (...);

  // Load and process it
  using (CPage objPage = new CPage(...))
  {
    // ...

    // Once the process is complete, add the page to the collection. If the memory threshold is reached,
    // this call may block until a thread becomes available to store the page on disk.
    objDocument.Pages.SetAt ( uiPageToProcess, objPage );
  }
});

// When the end of the document is reached, close the page collection
// to complete the creation of the document.
CDefaultPageCollection objDefaultPageCollection = ( CDefaultPageCollection ) objDocument.GetPages();
objDefaultPageCollection.SetReadyToClose();

// Wait for the document output thread completion before exiting
objOutputThread.Join();

With CDocumentProcessing

CApplication within a multithreaded document conversion workflow, using the experimental C++ API
/**
 * CreateDocument() processes one or more image(s) to create a document.
 * The pages of the document are stored using the default page collection
 */
void CApplication::CreateDocument ()
{
  using namespace IDRS::Experimental;

  // Create the objects
  CIDRSExt objIdrsExt = CIDRSExt::Create(4); // using 4 threads in total
  CDocumentProcessing objDocumentProcessing = CDocumentProcessing::Create(objIdrsExt);

  // Set the parameters
  objDocumentProcessing.SetOcrParams(m_objOcrParams);
  objDocumentProcessing.GetOutputs().AddTail(m_objOutputParams);

  // Launch the conversion
  CDocumentProcessingResult objResult = objDocumentProcessing.ProcessDocument(strFilePath, strOutputFilePath);
}
CDocument within a multithreaded document conversion workflow, using the experimental .NET API
using IDRSNET.Experimental;

// Create the objects
// CIDRSExt is constructed with the total number of threads to use;
// the CDocumentProcessing object is then built on top of it.
CIDRSExt objIdrs = new CIDRSExt(4); // using 4 threads in total
CDocumentProcessing objDocumentProcessing = new CDocumentProcessing ( objIdrs );

// Set the parameters: the OCR parameters and the output parameters
objDocumentProcessing.OcrParams = ...;
objDocumentProcessing.Outputs.Add(objOutputParams);

// Launch the conversion
// The first callback receives a page index and hands back the page content through its 'out' parameter.
// The second callback receives an output identifier.
// NOTE(review): the body of the first callback is elided and 'File.Open' looks abbreviated
// (presumably the callback opens the destination for that output) - confirm against the API reference.
CDocumentProcessingResult objResult = objDocumentProcessing.ProcessDocument(
                  (uint pageIndex, out CPageContent pageContent) => { ... },
                  (outputIdentifier) => File.Open
                  );