Skip to content

Commit 8d0e49c

Browse files
committed
Support Office (XML) files
1 parent 25fa63f commit 8d0e49c

6 files changed

+159
-2
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
[![GoDoc](https://godoc.org/github.com/gen2brain/go-fitz?status.svg)](https://godoc.org/github.com/gen2brain/go-fitz)
44
[![Go Report Card](https://goreportcard.com/badge/github.com/gen2brain/go-fitz?branch=master)](https://goreportcard.com/report/github.com/gen2brain/go-fitz)
55

6-
Go wrapper for [MuPDF](http://mupdf.com/) fitz library that can extract pages from PDF, EPUB and MOBI documents as images, text, html or svg.
6+
Go wrapper for [MuPDF](http://mupdf.com/) fitz library that can extract pages from PDF, EPUB, MOBI, DOCX, XLSX and PPTX documents as IMG, TXT, HTML or SVG.
77

88
### Build tags
99

fitz_content_types.go

+137-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package fitz
22

3-
import "bytes"
3+
import (
4+
"bytes"
5+
"encoding/binary"
6+
)
47

58
// contentType returns document MIME type.
69
func contentType(b []byte) string {
@@ -51,6 +54,12 @@ func contentType(b []byte) string {
5154
return "image/vnd.adobe.photoshop"
5255
case isZIP(b):
5356
switch {
57+
case isDOCX(b):
58+
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
59+
case isXLSX(b):
60+
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
61+
case isPPTX(b):
62+
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
5463
case isEPUB(b):
5564
return "application/epub+zip"
5665
case isXPS(b):
@@ -245,3 +254,130 @@ func isXML(b []byte) bool {
245254
b[4] == 0x3F && b[5] == 0x78 && b[6] == 0x6D && b[7] == 0x6C
246255
}
247256
}
257+
258+
// docType enumerates the Office Open XML document kinds that msooxml can
// report.
type docType int

const (
	// The zero value is deliberately left unnamed; msooxml and checkMSOoml
	// return it together with a false flag when nothing was recognized.
	_ docType = iota

	// typeDocx is reported for archives containing a "word/" entry.
	typeDocx
	// typeXlsx is reported for archives containing an "xl/" entry.
	typeXlsx
	// typePptx is reported for archives containing a "ppt/" entry.
	typePptx
	// typeOoxml is reported when the archive starts like an OOXML package but
	// no type-specific directory was found in the inspected entries.
	typeOoxml
)
266+
267+
func isDOCX(buf []byte) bool {
268+
typ, ok := msooxml(buf)
269+
return ok && typ == typeDocx
270+
}
271+
272+
func isXLSX(buf []byte) bool {
273+
typ, ok := msooxml(buf)
274+
return ok && typ == typeXlsx
275+
}
276+
277+
func isPPTX(buf []byte) bool {
278+
typ, ok := msooxml(buf)
279+
return ok && typ == typePptx
280+
}
281+
282+
func msooxml(buf []byte) (typ docType, found bool) {
283+
// make sure the first file is correct
284+
if v, ok := checkMSOoml(buf, 0x1E); ok {
285+
return v, ok
286+
}
287+
288+
if !compareBytes(buf, []byte("[Content_Types].xml"), 0x1E) &&
289+
!compareBytes(buf, []byte("_rels/.rels"), 0x1E) &&
290+
!compareBytes(buf, []byte("docProps"), 0x1E) &&
291+
!compareBytes(buf, []byte("_rels"), 0x1E) {
292+
return
293+
}
294+
295+
// skip to the second local file header
296+
// since some documents include a 520-byte extra field following the file
297+
// header, we need to scan for the next header
298+
startOffset := int(binary.LittleEndian.Uint32(buf[18:22]) + 49)
299+
idx := search(buf, startOffset, 6000)
300+
if idx == -1 {
301+
return
302+
}
303+
304+
// now skip to the *third* local file header; again, we need to scan due to a
305+
// 520-byte extra field following the file header
306+
startOffset += idx + 4 + 26
307+
idx = search(buf, startOffset, 6000)
308+
if idx == -1 {
309+
return
310+
}
311+
312+
// and check the subdirectory name to determine which type of OOXML
313+
// file we have. Correct the mimetype with the registered ones:
314+
// http://technet.microsoft.com/en-us/library/cc179224.aspx
315+
startOffset += idx + 4 + 26
316+
if typ, ok := checkMSOoml(buf, startOffset); ok {
317+
return typ, ok
318+
}
319+
320+
// OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
321+
startOffset += 26
322+
idx = search(buf, startOffset, 6000)
323+
if idx == -1 {
324+
return typeOoxml, true
325+
}
326+
327+
startOffset += idx + 4 + 26
328+
if typ, ok := checkMSOoml(buf, startOffset); ok {
329+
return typ, ok
330+
} else {
331+
return typeOoxml, true
332+
}
333+
}
334+
335+
// compareBytes reports whether subSlice occurs in slice exactly at
// startOffset.
//
// It returns false, instead of panicking, when startOffset is negative or
// lies beyond the end of slice, and when slice is too short to hold subSlice
// at that position. An empty subSlice matches at any offset in
// [0, len(slice)].
func compareBytes(slice, subSlice []byte, startOffset int) bool {
	// Validate the offset before slicing: offsets are derived from untrusted
	// file contents, and comparing against len(slice) directly avoids any
	// overflow in startOffset+len(subSlice).
	if startOffset < 0 || startOffset > len(slice) {
		return false
	}

	return bytes.HasPrefix(slice[startOffset:], subSlice)
}
351+
352+
func checkMSOoml(buf []byte, offset int) (typ docType, ok bool) {
353+
ok = true
354+
355+
switch {
356+
case compareBytes(buf, []byte("word/"), offset):
357+
typ = typeDocx
358+
case compareBytes(buf, []byte("ppt/"), offset):
359+
typ = typePptx
360+
case compareBytes(buf, []byte("xl/"), offset):
361+
typ = typeXlsx
362+
default:
363+
ok = false
364+
}
365+
366+
return
367+
}
368+
369+
// search looks for the ZIP local file header signature ("PK\x03\x04") in the
// window of buf that starts at start and is at most rangeNum bytes long; the
// window is clipped to the end of buf.
//
// It returns the index of the first match relative to start, or -1 when there
// is no match or the window is empty. A negative start also yields -1 rather
// than a slicing panic.
func search(buf []byte, start, rangeNum int) int {
	if start < 0 || rangeNum <= 0 || start >= len(buf) {
		return -1
	}

	// Derive the window end from the remaining length instead of computing
	// start+rangeNum up front, so that a huge rangeNum cannot overflow.
	end := len(buf)
	if rangeNum < end-start {
		end = start + rangeNum
	}

	return bytes.Index(buf[start:end], []byte{'P', 'K', 0x03, 0x04})
}

fitz_content_types_test.go

+21
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,24 @@ var xps []byte
150150
func TestContentTypeXPS(t *testing.T) {
151151
testContentType("application/oxps", xps, t)
152152
}
153+
154+
//go:embed testdata/test.docx
155+
var docx []byte
156+
157+
func TestContentTypeDOCX(t *testing.T) {
158+
testContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document", docx, t)
159+
}
160+
161+
//go:embed testdata/test.xlsx
162+
var xlsx []byte
163+
164+
func TestContentTypeXLSX(t *testing.T) {
165+
testContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", xlsx, t)
166+
}
167+
168+
//go:embed testdata/test.pptx
169+
var pptx []byte
170+
171+
func TestContentTypePPTX(t *testing.T) {
172+
testContentType("application/vnd.openxmlformats-officedocument.presentationml.presentation", pptx, t)
173+
}

testdata/test.docx

33.6 KB
Binary file not shown.

testdata/test.pptx

404 KB
Binary file not shown.

testdata/test.xlsx

28.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)