summaryrefslogtreecommitdiff
blob: 7a1470e4f56c78264add1758fc31843bf51cd25a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#ifndef ARTIFEX_EXTRACT_DOCUMENT_H
#define ARTIFEX_EXTRACT_DOCUMENT_H

static const double pi = 3.141592653589793;
/* Pi as a double. Because it is declared static const in a header, every
source file that includes this header gets its own private copy. */

typedef struct
{
    double x, y;
} point_t;
/* A pair of double-precision coordinates (x, y). */

typedef struct
{
    double a, b, c, d;
    double e, f;
} matrix_t;
/* Six doubles named a..f, stored in that order. matrix_cmp4() looks only at
the first four (a, b, c, d). NOTE(review): presumably a 2D affine transform
whose linear part is (a, b, c, d) and whose translation is (e, f) - confirm. */

double matrix_expansion(matrix_t m);
/* Returns a single scalar computed from matrix m (passed by value). span_t
documents its font size as matrix_expansion(trm). NOTE(review): presumably the
overall scaling factor of the transform - confirm against the implementation. */

int matrix_cmp4(const matrix_t *lhs, const matrix_t *rhs);
/* Compares only the first four members of *lhs and *rhs. Returns zero if they
are all equal, otherwise +/-1. */

typedef struct
{
    /* Position before transformation by ctm and trm. */
    double pre_x, pre_y;

    /* Position after transformation by ctm and trm. */
    double x, y;

    /* NOTE(review): ucs is presumably the character's Unicode code point and
    adv its advance - confirm. span_append_c() sets .ucs from its <c> argument. */
    unsigned ucs;
    double adv;
} char_t;
/* One character within a span. */

typedef struct
{
    matrix_t ctm;
    matrix_t trm;
    char *font_name;

    /* There is no separate font-size member; the font size is
    matrix_expansion(trm). */

    /* One-bit flags. NOTE(review): wmode is presumably the writing mode -
    confirm. */
    struct {
        unsigned font_bold : 1;
        unsigned font_italic : 1;
        unsigned wmode : 1;
    };

    /* The span's chars, and how many of them there are. */
    char_t *chars;
    int chars_num;
} span_t;
/* A list of chars that share the same font and are usually adjacent. */

char_t *span_char_last(span_t *span);
/* Returns the last char in span. NOTE(review): behaviour for a span with no
chars is not stated here - confirm before calling on an empty span. */

int span_append_c(extract_alloc_t *alloc, span_t *span, int c);
/* Appends a new char_t to span, with .ucs=c and every other field zeroed.
NOTE(review): the int result is presumably zero on success and non-zero on
failure - confirm against the implementation. */

const char *span_string(extract_alloc_t *alloc, span_t *span);
/* Returns a static string containing information about span. NOTE(review):
being static, the string is presumably owned by the callee and reused by later
calls - confirm before freeing or retaining it. */

typedef struct
{
    span_t **spans;
    int spans_num;
} line_t;
/* A list of spans that are aligned on the same line. */

span_t *line_span_first(line_t *line);
/* Returns the first span in line. */

span_t *line_span_last(line_t *line);
/* Returns the last span in line. */

typedef struct
{
    line_t **lines;
    int lines_num;
} paragraph_t;
/* A list of lines that are aligned with, and adjacent to, each other so that
together they form a paragraph. */

typedef struct
{
    char *type; /* jpg, png etc. */
    char *name; /* Name of image file within docx. */
    char *id;   /* ID of image within docx. */

    /* NOTE(review): presumably the image bytes and their length - confirm. */
    char *data;
    size_t data_size;

    /* NOTE(review): presumably the callback, and the handle passed to it,
    used to release <data> - confirm. */
    extract_image_data_free data_free;
    void *data_free_handle;
} image_t;
/* Describes one image. <type> is the value that was passed to
extract_add_image(); <name> and <id> are created to be unique identifiers for
use in the generated docx file. */

typedef struct
{
    span_t **spans;
    int spans_num;

    image_t *images;
    int images_num;

    /* Refer to items in .spans. Initially empty, then set by extract_join().
    NOTE(review): this header declares extract_document_join(), not
    extract_join() - confirm which function is meant. */
    line_t **lines;
    int lines_num;

    /* Refer to items in .lines. Initially empty, then set by extract_join(). */
    paragraph_t **paragraphs;
    int paragraphs_num;
} page_t;
/* A single page, holding several different representations of its list of
spans. */

typedef struct
{
    page_t **pages;
    int pages_num;
} document_t;
/* A document, represented as a list of pages. */


typedef struct
{
    image_t *images;
    int images_num;
    char **imagetypes;
    int imagetypes_num;
} images_t;
/* A list of images plus a list of image-type strings. NOTE(review):
<imagetypes> presumably holds the distinct <type> values found in <images> -
confirm. */

int extract_document_join(extract_alloc_t *alloc, document_t *document);
/* NOTE(review): presumably fills in the .lines and .paragraphs members of each
page in document (the page_t comments mention extract_join()) and returns zero
on success - confirm against the implementation. */

#endif