DocWire DocToText - Powered by Silvercoders 5.0.5
A multifaceted, data extraction software development toolkit that converts all sorts of files to plain text and html. Written in C++, this data extraction tool has a parser able to convert PST & OST files along with a brand new API for better file processing. To enhance its utility, DocToText, as a data extraction tool, can be integrated with other data mining and data analytics applications. It comes equipped with a high grade, scriptable and trainable OCR that has LSTM neural networks based character recognition. This document parser is able to extract metadata along with annotations and supports a list of formats that include: DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), PDF, EML, HTML, Outlook (PST, OST), Image (JPG, JPEG, JFIF, BMP, PNM, PNG, TIFF, WEBP) and DICOM (DCM)
doctotext_c_api.h
Go to the documentation of this file.
1/***************************************************************************************************************************************************/
2/* DocToText - A multifaceted, data extraction software development toolkit that converts all sorts of files to plain text and html. */
3/* Written in C++, this data extraction tool has a parser able to convert PST & OST files along with a brand new API for better file processing. */
4/* To enhance its utility, DocToText, as a data extraction tool, can be integrated with other data mining and data analytics applications. */
5/* It comes equipped with a high grade, scriptable and trainable OCR that has LSTM neural networks based character recognition. */
6/* */
7/* This document parser is able to extract metadata along with annotations and supports a list of formats that include: */
8/* DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), */
9/* PDF, EML, HTML, Outlook (PST, OST), Image (JPG, JPEG, JFIF, BMP, PNM, PNG, TIFF, WEBP) and DICOM (DCM) */
10/* */
11/* Copyright (c) SILVERCODERS Ltd */
12/* http://silvercoders.com */
13/* */
14/* Project homepage: */
15/* http://silvercoders.com/en/products/doctotext */
16/* https://www.docwire.io/ */
17/* */
18/* The GNU General Public License version 2 as published by the Free Software Foundation and found in the file COPYING.GPL permits */
19/* the distribution and/or modification of this application. */
20/* */
21/* Please keep in mind that any attempt to circumvent the terms of the GNU General Public License by employing wrappers, pipelines, */
22/* client/server protocols, etc. is illegal. You must purchase a commercial license if your program, which is distributed under a license */
23/* other than the GNU General Public License version 2, directly or indirectly calls any portion of this code. */
24/* Simply stop using the product if you disagree with this viewpoint. */
25/* */
26/* According to the terms of the license provided by SILVERCODERS and included in the file COPYING.COM, licensees in possession of */
27/* a current commercial license for this product may use this file. */
28/* */
29/* This program is provided WITHOUT ANY WARRANTY, not even the implicit warranty of merchantability or fitness for a particular purpose. */
30/* It is supplied in the hope that it will be useful. */
31/***************************************************************************************************************************************************/
32
39#ifndef DOCTOTEXT_C_API_H
40#define DOCTOTEXT_C_API_H
41
42#include <stdbool.h>
43#include "defines.h"
44
45#ifdef __cplusplus
46extern "C"
47{
48#endif
49
50#ifndef DOCTOTEXT_CALL
51 #if defined(_WIN32) && !defined(_WIN64)
52 #define DOCTOTEXT_CALL __cdecl
53 #else
54 #define DOCTOTEXT_CALL
55 #endif
56#endif
57
59typedef struct DocToTextItem DocToTextItem;
64
70
79DOCTOTEXT_CALL doctotext_create_simple_extractor(const char* file_name);
80
86DllExport const char *
88
97DllExport void
98DOCTOTEXT_CALL doctotext_simple_extractor_add_callback_function(DocToTextSimpleExtractor* extractor, void (*callback)(DocToTextInfo*, void* data), void* data);
99
107DllExport DocToTextImporter*
108DOCTOTEXT_CALL doctotext_create_importer_from_file_name(DocToTextParserManager *manager, const char *file_name);
109
117DllExport DocToTextImporter*
118DOCTOTEXT_CALL doctotext_create_importer_from_stream(DocToTextParserManager *manager, FILE *input_stream);
119
125DllExport DocToTextExporter*
126DOCTOTEXT_CALL doctotext_create_plain_text_exporter(FILE *output_stream);
127
133DllExport DocToTextExporter*
134DOCTOTEXT_CALL doctotext_create_html_exporter(FILE *output_stream);
135
143DllExport DocToTextParsingChain*
145
154DllExport DocToTextTransformer*
155DOCTOTEXT_CALL doctotext_create_transfomer(void (*callback)(DocToTextInfo*, void* data), void* data);
156
164DllExport DocToTextParsingChain*
166
173DllExport DocToTextParsingChain*
175
182DllExport DocToTextParsingChain*
184
190DllExport void
191DOCTOTEXT_CALL doctotext_parsing_chain_set_input(DocToTextParsingChain *parsing_chain, FILE* input_stream);
192
199DllExport void
201
208DllExport void
210
217DllExport void
219
226DllExport void
228
234DllExport DocToTextParserManager*
235DOCTOTEXT_CALL doctotext_init_parser_manager(const char* path_to_plugins);
236
243DllExport char**
244doctotext_parser_manager_get_available_formats(DocToTextParserManager* parser_manager, unsigned int *formats_number);
245
254DllExport DocToTextParser*
255DOCTOTEXT_CALL doctotext_parser_manager_get_parser_by_extension(DocToTextParserManager* parser_manager, const char *format);
256
282DllExport void
283DOCTOTEXT_CALL doctotext_parser_add_callback_on_new_node(DocToTextParser* parser, void (*callback)(DocToTextInfo*, void* data), void* data);
284
290DllExport void
292
297DllExport void
299
306DllExport void
308
314DllExport const char*
316
322DllExport const char*
324
331DllExport const char*
332DOCTOTEXT_CALL doctotext_info_get_string_attribute(DocToTextInfo* info, const char* attribute_name);
333
340DllExport unsigned int
341DOCTOTEXT_CALL doctotext_info_get_uint_attribute(DocToTextInfo* info, const char* attribute_name);
342
362DllExport void
363DOCTOTEXT_CALL doctotext_info_set_cancel_parser(DocToTextInfo* info, bool cancel);
364
372DllExport void
373DOCTOTEXT_CALL doctotext_info_set_skip(DocToTextInfo* info, bool skip);
374
387DllExport DocToTextParameters*
389
396DllExport void
397DOCTOTEXT_CALL doctotext_add_int_parameter(DocToTextParameters* parameters, const char* name, int value);
398
405DllExport void
406DOCTOTEXT_CALL doctotext_add_uint_parameter(DocToTextParameters* parameters, const char* name, unsigned int value);
407
414DllExport void
415DOCTOTEXT_CALL doctotext_add_float_parameter(DocToTextParameters* parameters, const char* name, float value);
416
417
424DllExport void
425DOCTOTEXT_CALL doctotext_add_string_parameter(DocToTextParameters* parameters, const char* name, const char* value);
426
448DllExport DocToTextWriter*
450
454DllExport DocToTextWriter*
456
463DllExport void
465
471DllExport void
472DOCTOTEXT_CALL doctotext_writer_write(DocToTextWriter* writer, DocToTextInfo* info, FILE* out_stream);
473
478DllExport void
479DOCTOTEXT_CALL doctotext_writer_write_header(DocToTextWriter* writer, FILE* out_stream);
480
485DllExport void
486DOCTOTEXT_CALL doctotext_writer_write_footer(DocToTextWriter* writer, FILE* out_stream);
487
488#ifdef __cplusplus
489}
490#endif
491
492#endif //DOCTOTEXT_C_API_H
DllExport DocToTextWriter *DOCTOTEXT_CALL doctotext_create_plain_text_writer()
Creates PlainTextWriter. PlainTextWriter writes parsed data from callbacks as plain text.
DllExport DocToTextParsingChain *DOCTOTEXT_CALL doctotext_connect_parsing_chain_to_exporter(DocToTextParsingChain *parsing_chain, DocToTextExporter *exporter)
Adds exporter to the parsing chain.
DllExport const char *DOCTOTEXT_CALL doctotext_info_get_plain_text(DocToTextInfo *info)
Returns parsed text from DocToTextInfo.
struct DocToTextParser DocToTextParser
DllExport DocToTextWriter *DOCTOTEXT_CALL doctotext_create_html_writer()
Creates HtmlWriter. HtmlWriter writes parsed data from callbacks as HTML. Example of usage:
DllExport void DOCTOTEXT_CALL doctotext_add_uint_parameter(DocToTextParameters *parameters, const char *name, unsigned int value)
Adds unsigned int parameter to parser parameters.
DllExport DocToTextParsingChain *DOCTOTEXT_CALL doctotext_connect_importer_to_exporter(DocToTextImporter *importer, DocToTextExporter *exporter)
Creates connection between importer and exporter and returns DocToTextParsingChain which contains all...
struct DocToTextParserManager DocToTextParserManager
DllExport void DOCTOTEXT_CALL doctotext_simple_extractor_add_callback_function(DocToTextSimpleExtractor *extractor, void(*callback)(DocToTextInfo *, void *data), void *data)
Adds a callback function to be called during parsing. Example of usage:
DllExport unsigned int DOCTOTEXT_CALL doctotext_info_get_uint_attribute(DocToTextInfo *info, const char *attribute_name)
Returns attribute value as an unsigned integer from DocToTextInfo.
DllExport void DOCTOTEXT_CALL doctotext_writer_write_header(DocToTextWriter *writer, FILE *out_stream)
Returns beginning of text from callbacks.
DllExport void DOCTOTEXT_CALL doctotext_parser_add_callback_on_new_node(DocToTextParser *parser, void(*callback)(DocToTextInfo *, void *data), void *data)
Adds a new function to execute when a new node is parsed. A node is a part of a hierarchical structure....
DllExport void DOCTOTEXT_CALL doctotext_writer_write(DocToTextWriter *writer, DocToTextInfo *info, FILE *out_stream)
Converts text from callback to html format.
DllExport DocToTextParserManager *DOCTOTEXT_CALL doctotext_init_parser_manager(const char *path_to_plugins)
Creates new parser manager with all available parsers.
DllExport DocToTextParser *DOCTOTEXT_CALL doctotext_parser_manager_get_parser_by_extension(DocToTextParserManager *parser_manager, const char *format)
Returns proper parser for given format. The format is defined by file extension. Example of usage:
DllExport void DOCTOTEXT_CALL doctotext_free_parser(DocToTextParser *parser)
Frees parser. Remember not to use function free(). DocToTextParser is allocated using operator new (f...
struct DocToTextInfo DocToTextInfo
struct DocToTextParameters DocToTextParameters
DllExport const char *DOCTOTEXT_CALL doctotext_simple_extractor_get_plain_text(DocToTextSimpleExtractor *extractor)
Gets parsed plain text from a DocToTextSimpleExtractor object.
DllExport void DOCTOTEXT_CALL doctotext_parser_add_parameters(DocToTextParser *parser, DocToTextParameters *parameters)
Adds DocToTextParameters to the parser. Every parser recursively passes DocToTextParameters to another pars...
DllExport void DOCTOTEXT_CALL doctotext_writer_write_footer(DocToTextWriter *writer, FILE *out_stream)
Returns end of text from callbacks.
DllExport void DOCTOTEXT_CALL doctotext_info_set_skip(DocToTextInfo *info, bool skip)
Sets the skip flag in DocToTextInfo. If skip is true then the current node will be skipped....
struct DocToTextWriter DocToTextWriter
DllExport DocToTextExporter *DOCTOTEXT_CALL doctotext_create_html_exporter(FILE *output_stream)
Creates a new DocToTextExporter object. This object is used to export parsed data to output as a html...
DllExport void DOCTOTEXT_CALL doctotext_free_transformer(DocToTextTransformer *transformer)
Frees transformer and all resources allocated by the transformer. Remember not to use function free()...
DllExport void DOCTOTEXT_CALL doctotext_free_writer(DocToTextWriter *writer)
Frees HtmlWriter. DocToTextWriter is allocated using operator new (from C++) and is supposed to be de...
DllExport void DOCTOTEXT_CALL doctotext_add_float_parameter(DocToTextParameters *parameters, const char *name, float value)
Adds float parameter to parser parameters.
DllExport DocToTextExporter *DOCTOTEXT_CALL doctotext_create_plain_text_exporter(FILE *output_stream)
Creates a new DocToTextExporter object. This object is used to export parsed data to output as a plai...
DllExport void DOCTOTEXT_CALL doctotext_info_set_cancel_parser(DocToTextInfo *info, bool cancel)
Sets the cancel flag in DocToTextInfo. If cancel is true then the parsing chain will be stopped....
DllExport void DOCTOTEXT_CALL doctotext_parser_parse(DocToTextParser *parser)
Starts parsing the loaded data. The data comes from a file or from a buffer.
struct DocToTextTransformer DocToTextTransformer
struct DocToTextSimpleExtractor DocToTextSimpleExtractor
DllExport void DOCTOTEXT_CALL doctotext_free_parsing_chain(DocToTextParsingChain *parsing_chain)
Frees parsing_chain and all resources allocated by the parsing chain. Remember not to use function fr...
DllExport DocToTextImporter *DOCTOTEXT_CALL doctotext_create_importer_from_file_name(DocToTextParserManager *manager, const char *file_name)
Creates a new DocToTextImporter object. This object is used to import a file and parse it using avail...
DllExport char ** doctotext_parser_manager_get_available_formats(DocToTextParserManager *parser_manager, unsigned int *formats_number)
struct DocToTextExporter DocToTextExporter
DllExport DocToTextImporter *DOCTOTEXT_CALL doctotext_create_importer_from_stream(DocToTextParserManager *manager, FILE *input_stream)
Creates a new DocToTextImporter object. This object is used to import data from an input stream and pa...
DllExport void DOCTOTEXT_CALL doctotext_parsing_chain_set_input(DocToTextParsingChain *parsing_chain, FILE *input_stream)
Adds input stream to the parsing chain. This function starts parsing chain.
DllExport DocToTextTransformer *DOCTOTEXT_CALL doctotext_create_transfomer(void(*callback)(DocToTextInfo *, void *data), void *data)
Creates a new DocToTextTransformer object. This object is used to transform parsed data....
struct DocToTextParsingChain DocToTextParsingChain
DllExport const char *DOCTOTEXT_CALL doctotext_info_get_tag_name(DocToTextInfo *info)
DllExport void DOCTOTEXT_CALL doctotext_add_string_parameter(DocToTextParameters *parameters, const char *name, const char *value)
Adds const char* parameter to parser parameters.
DllExport DocToTextParameters *DOCTOTEXT_CALL doctotext_create_parameter()
Creates new empty DocToTextParameters. In next step we can pass to DocToTextParameters required param...
DllExport void DOCTOTEXT_CALL doctotext_free_exporter(DocToTextExporter *exporter)
Frees exporter and all resources allocated by the exporter. Remember not to use function free()....
DllExport DocToTextParsingChain *DOCTOTEXT_CALL doctotext_connect_parsing_chain_to_transformer(DocToTextParsingChain *parsing_chain, DocToTextTransformer *transformer)
Adds transformer to the parsing chain.
DllExport const char *DOCTOTEXT_CALL doctotext_info_get_string_attribute(DocToTextInfo *info, const char *attribute_name)
Returns attribute value as a string from DocToTextInfo.
DllExport void DOCTOTEXT_CALL doctotext_free_importer(DocToTextImporter *importer)
Frees importer and all resources allocated by the importer. DocToTextImporter is allocated using oper...
struct DocToTextImporter DocToTextImporter
DllExport DocToTextParsingChain *DOCTOTEXT_CALL doctotext_connect_importer_to_transformer(DocToTextImporter *importer, DocToTextTransformer *transformer)
Creates connection between importer and transformer and returns DocToTextParsingChain which contains ...
DllExport DocToTextSimpleExtractor *DOCTOTEXT_CALL doctotext_create_simple_extractor(const char *file_name)
Creates a new DocToTextSimpleExtractor object. Example:
DllExport void DOCTOTEXT_CALL doctotext_add_int_parameter(DocToTextParameters *parameters, const char *name, int value)
Adds int parameter to parser parameters.