DocWire DocToText - Powered by Silvercoders 5.0.5
A multifaceted, data extraction software development toolkit that converts all sorts of files to plain text and html. Written in C++, this data extraction tool has a parser able to convert PST & OST files along with a brand new API for better file processing. To enhance its utility, DocToText, as a data extraction tool, can be integrated with other data mining and data analytics applications. It comes equipped with a high grade, scriptable and trainable OCR that has LSTM neural networks based character recognition. This document parser is able to extract metadata along with annotations and supports a list of formats that include: DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), PDF, EML, HTML, Outlook (PST, OST), Image (JPG, JPEG, JFIF, BMP, PNM, PNG, TIFF, WEBP) and DICOM (DCM)
parser_wrapper.h
1/***************************************************************************************************************************************************/
2/* DocToText - A multifaceted, data extraction software development toolkit that converts all sorts of files to plain text and html. */
3/* Written in C++, this data extraction tool has a parser able to convert PST & OST files along with a brand new API for better file processing. */
4/* To enhance its utility, DocToText, as a data extraction tool, can be integrated with other data mining and data analytics applications. */
5/* It comes equipped with a high grade, scriptable and trainable OCR that has LSTM neural networks based character recognition. */
6/* */
7/* This document parser is able to extract metadata along with annotations and supports a list of formats that include: */
8/* DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), */
9/* PDF, EML, HTML, Outlook (PST, OST), Image (JPG, JPEG, JFIF, BMP, PNM, PNG, TIFF, WEBP) and DICOM (DCM) */
10/* */
11/* Copyright (c) SILVERCODERS Ltd */
12/* http://silvercoders.com */
13/* */
14/* Project homepage: */
15/* http://silvercoders.com/en/products/doctotext */
16/* https://www.docwire.io/ */
17/* */
18/* The GNU General Public License version 2 as published by the Free Software Foundation and found in the file COPYING.GPL permits */
19/* the distribution and/or modification of this application. */
20/* */
21/* Please keep in mind that any attempt to circumvent the terms of the GNU General Public License by employing wrappers, pipelines, */
22/* client/server protocols, etc. is illegal. You must purchase a commercial license if your program, which is distributed under a license */
23/* other than the GNU General Public License version 2, directly or indirectly calls any portion of this code. */
24/* Simply stop using the product if you disagree with this viewpoint. */
25/* */
26/* According to the terms of the license provided by SILVERCODERS and included in the file COPYING.COM, licensees in possession of */
27/* a current commercial license for this product may use this file. */
28/* */
29/* This program is provided WITHOUT ANY WARRANTY, not even the implicit warranty of merchantability or fitness for a particular purpose. */
30/* It is supplied in the hope that it will be useful. */
31/***************************************************************************************************************************************************/
32
33#ifndef PARSERWRAPPER_H
34#define PARSERWRAPPER_H
35
36#include <iostream>
37
38#include "metadata.h"
39#include "formatting_style.h"
40#include "parser.h"
41#include "parser_builder.h"
42#include "defines.h"
43
44namespace doctotext
45{
46
47template<typename ParserType>
48class DllExport ParserWrapper : public doctotext::Parser
49{
50public:
51 explicit ParserWrapper(const std::string& file_name, const std::shared_ptr<doctotext::ParserManager> &inParserManager = nullptr)
52 : Parser(inParserManager),
53 m_parser(ParserType(file_name))
54 {}
55
56 ParserWrapper(const char* buffer, size_t size, const std::shared_ptr<doctotext::ParserManager> &inParserManager = nullptr)
57 : Parser(inParserManager) ,
58 m_parser(ParserType(buffer, size))
59 {}
60
61 void parse() const override
62 {
63 sendTag(doctotext::StandardTag::TAG_TEXT, m_parser.plainText(getFormattingStyle()));
64 sendTag(doctotext::StandardTag::TAG_METADATA, "", m_parser.metaData().getFieldsAsAny());
65 }
66
67 Parser &withParameters(const ParserParameters &parameters) override
68 {
69 doctotext::Parser::withParameters(parameters);
70 m_parser.setVerboseLogging(isVerboseLogging());
71 m_parser.setLogStream(getLogOutStream());
72 return *this;
73 }
74
75 void setParserManager(const std::shared_ptr<doctotext::ParserManager> &inParserManager)
76 {
77 m_parser_manager = inParserManager;
78 }
79
80private:
81 mutable ParserType m_parser;
82};
83
// Factory policy whose create() overloads return a ParserWrapper<ParserType>
// as a std::unique_ptr<doctotext::Parser>.
// NOTE(review): the class-head line (listing line 85) is missing from this
// extract; restore it from the original header before compiling.
84template<typename ParserType>
86{
87public:
// Creates a ParserWrapper<ParserType> for the given file name; both arguments
// are forwarded unchanged to the ParserWrapper constructor.
88 static std::unique_ptr<doctotext::Parser>
89 create(const std::string &inFileName, const std::shared_ptr<doctotext::ParserManager> &parserManager)
90 {
91 return std::make_unique<ParserWrapper<ParserType>>(inFileName, parserManager);
92 }
93
// Creates a ParserWrapper<ParserType> for an in-memory buffer of `size` bytes;
// all arguments are forwarded unchanged to the ParserWrapper constructor.
94 static std::unique_ptr<doctotext::Parser>
95 create(const char* buffer, size_t size, const std::shared_ptr<doctotext::ParserManager> &parserManager)
96 {
97 return std::make_unique<ParserWrapper<ParserType>>(buffer, size, parserManager);
98 }
99};
100
101template<typename ParserType>
102class DllExport parser_creator
103{
104public:
105 static std::unique_ptr<doctotext::Parser>
106 create(const std::string &inFileName, const std::shared_ptr<doctotext::ParserManager> &parserManager)
107 {
108 return std::make_unique<ParserType>(inFileName, parserManager);
109 }
110
111 static std::unique_ptr<doctotext::Parser>
112 create(const char* buffer, size_t size, const std::shared_ptr<doctotext::ParserManager> &parserManager)
113 {
114 return std::make_unique<ParserType>(buffer, size, parserManager);
115 }
116};
117
// Builder template that creates parsers through a ParserCreator policy object
// and configures them with the callbacks, parser manager and parameters
// collected by the with*() setters below.
// NOTE(review): this extract omits several listing lines: the class head (123),
// the constructor declarator (126) and the return-type lines of the with*()
// setters (159, 166, 173, 180, 187). Restore them from the original header
// before compiling.
122template<typename ParserCreator>
124{
125public:
// Constructor initializer list: default-constructs the creator, points the
// log stream at std::cerr and disables verbose logging.
127 : m_parser_creator(ParserCreator()),
128 m_log_stream(&std::cerr),
129 m_verbose(false)
130 {
131
132 }
133
// Builds a parser for the given file name: asks the creator for a parser
// (passing the stored parser manager), registers every stored callback via
// addOnNewNodeCallback(), applies the accumulated parameters and returns it.
134 std::unique_ptr<doctotext::Parser>
135 build(const std::string &inFileName) const override
136 {
137 auto parser = m_parser_creator.create(inFileName, m_parser_manager);
138 for (auto &callback : m_callbacks)
139 {
140 parser->addOnNewNodeCallback(callback);
141 }
142 parser->withParameters(m_parameters);
143 return parser;
144 }
145
// Same as the file-name overload, but the parser is created from an
// in-memory buffer of `size` bytes.
146 std::unique_ptr<doctotext::Parser>
147 build(const char* buffer, size_t size) const override
148 {
149 auto parser = m_parser_creator.create(buffer, size, m_parser_manager);
150 for (auto &callback : m_callbacks)
151 {
152 parser->addOnNewNodeCallback(callback);
153 }
154 parser->withParameters(m_parameters);
155 return parser;
156 }
157
158
// Stores the log stream pointer and returns *this for chaining.
// NOTE(review): in the code visible here build() never reads m_log_stream
// (or m_verbose) - confirm how these settings are meant to reach the parser.
160 withLogStream(std::ostream* log_stream) override
161 {
162 m_log_stream = log_stream;
163 return *this;
164 }
165
// Stores the verbose-logging flag and returns *this for chaining.
167 withVerboseLogging(bool verbose) override
168 {
169 m_verbose = verbose;
170 return *this;
171 }
172
// Replaces (does not append to) the stored callback list; returns *this.
174 withOnNewNodeCallbacks(const std::vector<doctotext::NewNodeCallback> &callbacks) override
175 {
176 m_callbacks = callbacks;
177 return *this;
178 }
179
// Stores the parser manager that build() passes to the creator; returns *this.
181 withParserManager(const std::shared_ptr<doctotext::ParserManager> &inParserManager) override
182 {
183 m_parser_manager = inParserManager;
184 return *this;
185 }
186
// Merges the given parameters into the stored set using operator+= (repeated
// calls accumulate rather than overwrite the member); returns *this.
188 withParameters(const ParserParameters &inParameter) override
189 {
190 m_parameters += inParameter;
191 return *this;
192 }
193
194private:
// Policy object whose create() overloads construct the parser.
195 ParserCreator m_parser_creator;
// Log stream set by withLogStream(); initialised to &std::cerr.
196 std::ostream *m_log_stream;
// Verbose flag set by withVerboseLogging(); initialised to false.
197 bool m_verbose;
// Callbacks registered on every parser produced by build().
198 std::vector<doctotext::NewNodeCallback> m_callbacks;
// Parser manager handed to the creator in build().
199 std::shared_ptr<doctotext::ParserManager> m_parser_manager;
// Parameters accumulated by withParameters() and applied in build().
200 ParserParameters m_parameters;
201};
202} // namespace doctotext
203#endif //PARSERWRAPPER_H
Provides the basic mechanism to build any parser.
doctotext::ParserBuilder & withParameters(const ParserParameters &inParameter) override
Sets parser parameters.
doctotext::ParserBuilder & withVerboseLogging(bool verbose) override
Turns on/off verbose logging.
doctotext::ParserBuilder & withLogStream(std::ostream *log_stream) override
Sets log stream for parser.
std::unique_ptr< doctotext::Parser > build(const std::string &inFileName) const override
Builds new parser object.
std::unique_ptr< doctotext::Parser > build(const char *buffer, size_t size) const override
Builds new parser object.
Abstract class for all parsers.
Definition: parser.h:130
Stores a list of parser parameters. Every parser can query ParserParameters for a specific parameter...
void parse() const override
Executes text parsing.
static const std::string TAG_TEXT
Tag for text.
Definition: parser.h:71
static const std::string TAG_METADATA
Tag for metadata.
Definition: parser.h:91