|
1 | 1 | package fitz
|
2 | 2 |
|
3 |
| -import "bytes" |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "encoding/binary" |
| 6 | +) |
4 | 7 |
|
5 | 8 | // contentType returns document MIME type.
|
6 | 9 | func contentType(b []byte) string {
|
@@ -51,6 +54,12 @@ func contentType(b []byte) string {
|
51 | 54 | return "image/vnd.adobe.photoshop"
|
52 | 55 | case isZIP(b):
|
53 | 56 | switch {
|
| 57 | + case isDOCX(b): |
| 58 | + return "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
| 59 | + case isXLSX(b): |
| 60 | + return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
| 61 | + case isPPTX(b): |
| 62 | + return "application/vnd.openxmlformats-officedocument.presentationml.presentation" |
54 | 63 | case isEPUB(b):
|
55 | 64 | return "application/epub+zip"
|
56 | 65 | case isXPS(b):
|
@@ -245,3 +254,130 @@ func isXML(b []byte) bool {
|
245 | 254 | b[4] == 0x3F && b[5] == 0x78 && b[6] == 0x6D && b[7] == 0x6C
|
246 | 255 | }
|
247 | 256 | }
|
| 257 | + |
// docType identifies the flavour of Office Open XML package recognised by
// msooxml.
type docType int

// Known docType values. Numbering starts at one, so the zero value of docType
// does not correspond to any recognised document type.
const (
	_ docType = iota
	// typeDocx marks a package containing a "word/" entry.
	typeDocx
	// typeXlsx marks a package containing an "xl/" entry.
	typeXlsx
	// typePptx marks a package containing a "ppt/" entry.
	typePptx
	// typeOoxml marks a package that looks like OOXML but whose specific
	// flavour could not be determined.
	typeOoxml
)
| 266 | + |
| 267 | +func isDOCX(buf []byte) bool { |
| 268 | + typ, ok := msooxml(buf) |
| 269 | + return ok && typ == typeDocx |
| 270 | +} |
| 271 | + |
| 272 | +func isXLSX(buf []byte) bool { |
| 273 | + typ, ok := msooxml(buf) |
| 274 | + return ok && typ == typeXlsx |
| 275 | +} |
| 276 | + |
| 277 | +func isPPTX(buf []byte) bool { |
| 278 | + typ, ok := msooxml(buf) |
| 279 | + return ok && typ == typePptx |
| 280 | +} |
| 281 | + |
| 282 | +func msooxml(buf []byte) (typ docType, found bool) { |
| 283 | + // make sure the first file is correct |
| 284 | + if v, ok := checkMSOoml(buf, 0x1E); ok { |
| 285 | + return v, ok |
| 286 | + } |
| 287 | + |
| 288 | + if !compareBytes(buf, []byte("[Content_Types].xml"), 0x1E) && |
| 289 | + !compareBytes(buf, []byte("_rels/.rels"), 0x1E) && |
| 290 | + !compareBytes(buf, []byte("docProps"), 0x1E) && |
| 291 | + !compareBytes(buf, []byte("_rels"), 0x1E) { |
| 292 | + return |
| 293 | + } |
| 294 | + |
| 295 | + // skip to the second local file header |
| 296 | + // since some documents include a 520-byte extra field following the file |
| 297 | + // header, we need to scan for the next header |
| 298 | + startOffset := int(binary.LittleEndian.Uint32(buf[18:22]) + 49) |
| 299 | + idx := search(buf, startOffset, 6000) |
| 300 | + if idx == -1 { |
| 301 | + return |
| 302 | + } |
| 303 | + |
| 304 | + // now skip to the *third* local file header; again, we need to scan due to a |
| 305 | + // 520-byte extra field following the file header |
| 306 | + startOffset += idx + 4 + 26 |
| 307 | + idx = search(buf, startOffset, 6000) |
| 308 | + if idx == -1 { |
| 309 | + return |
| 310 | + } |
| 311 | + |
| 312 | + // and check the subdirectory name to determine which type of OOXML |
| 313 | + // file we have. Correct the mimetype with the registered ones: |
| 314 | + // http://technet.microsoft.com/en-us/library/cc179224.aspx |
| 315 | + startOffset += idx + 4 + 26 |
| 316 | + if typ, ok := checkMSOoml(buf, startOffset); ok { |
| 317 | + return typ, ok |
| 318 | + } |
| 319 | + |
| 320 | + // OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file |
| 321 | + startOffset += 26 |
| 322 | + idx = search(buf, startOffset, 6000) |
| 323 | + if idx == -1 { |
| 324 | + return typeOoxml, true |
| 325 | + } |
| 326 | + |
| 327 | + startOffset += idx + 4 + 26 |
| 328 | + if typ, ok := checkMSOoml(buf, startOffset); ok { |
| 329 | + return typ, ok |
| 330 | + } else { |
| 331 | + return typeOoxml, true |
| 332 | + } |
| 333 | +} |
| 334 | + |
// compareBytes reports whether subSlice occurs in slice starting exactly at
// startOffset.
//
// It returns false, rather than panicking, when startOffset is negative or
// when subSlice would extend past the end of slice. An empty subSlice matches
// at any offset in [0, len(slice)].
func compareBytes(slice, subSlice []byte, startOffset int) bool {
	// Written as a subtraction so the bound check cannot overflow for large
	// offsets; a negative right-hand side rejects every non-negative offset.
	if startOffset < 0 || startOffset > len(slice)-len(subSlice) {
		return false
	}

	return bytes.Equal(slice[startOffset:startOffset+len(subSlice)], subSlice)
}
| 351 | + |
| 352 | +func checkMSOoml(buf []byte, offset int) (typ docType, ok bool) { |
| 353 | + ok = true |
| 354 | + |
| 355 | + switch { |
| 356 | + case compareBytes(buf, []byte("word/"), offset): |
| 357 | + typ = typeDocx |
| 358 | + case compareBytes(buf, []byte("ppt/"), offset): |
| 359 | + typ = typePptx |
| 360 | + case compareBytes(buf, []byte("xl/"), offset): |
| 361 | + typ = typeXlsx |
| 362 | + default: |
| 363 | + ok = false |
| 364 | + } |
| 365 | + |
| 366 | + return |
| 367 | +} |
| 368 | + |
// search looks for the ZIP local file header signature ("PK\x03\x04") in
// buf[start:start+rangeNum], clamping the window to the end of buf.
//
// It returns the index of the first signature relative to start, or -1 when
// no signature lies fully inside the window. It also returns -1, instead of
// panicking, when start is negative or at/after the end of buf, or when
// rangeNum is not positive.
func search(buf []byte, start, rangeNum int) int {
	if start < 0 || start >= len(buf) || rangeNum <= 0 {
		return -1
	}

	// Clamp via the remaining length so start+rangeNum is only computed when
	// it is known to fit inside buf (and therefore cannot overflow).
	end := len(buf)
	if rangeNum < end-start {
		end = start + rangeNum
	}

	signature := []byte{'P', 'K', 0x03, 0x04}

	return bytes.Index(buf[start:end], signature)
}
0 commit comments