-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement Stream Decompression for tar
Implement stream decompression for tar files. This means that -x becomes smart enough to handle lzw, bzip2, gzip, and xz (common tar compression) formats automatically. This will remove a sharp edge on pget and handle cases of compressed tar files elegantly.
- Loading branch information
1 parent
8d7cd21
commit b38cb8e
Showing
7 changed files
with
317 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
package extract | ||
|
||
import ( | ||
"compress/bzip2" | ||
"compress/gzip" | ||
"compress/lzw" | ||
"encoding/binary" | ||
"io" | ||
|
||
"github.com/pierrec/lz4" | ||
"github.com/ulikunitz/xz" | ||
) | ||
|
||
const ( | ||
peekSize = 8 | ||
|
||
gzipMagic = 0x1F8B | ||
bzipMagic = 0x425A | ||
xzMagic = 0xFD377A585A00 | ||
lzwMagic = 0x1F9D | ||
lz4Magic = 0x184D2204 | ||
) | ||
|
||
var _ decompressor = gzipDecompressor{} | ||
var _ decompressor = bzip2Decompressor{} | ||
var _ decompressor = xzDecompressor{} | ||
var _ decompressor = lzwDecompressor{} | ||
var _ decompressor = lz4Decompressor{} | ||
var _ decompressor = noOpDecompressor{} | ||
|
||
// decompressor represents different compression formats. | ||
type decompressor interface { | ||
decompress(r io.Reader) (io.Reader, error) | ||
} | ||
|
||
// detectFormat returns the appropriate extractor according to the magic number. | ||
func detectFormat(input []byte) decompressor { | ||
inputSize := len(input) | ||
|
||
if inputSize < 2 { | ||
return noOpDecompressor{} | ||
} | ||
// pad to 8 bytes | ||
if inputSize < 8 { | ||
input = append(input, make([]byte, peekSize-inputSize)...) | ||
} | ||
|
||
magic16 := binary.BigEndian.Uint16(input) | ||
magic32 := binary.BigEndian.Uint32(input) | ||
// We need to pre-pend the padding since we're reading into something bigendian and exceeding the | ||
// 48bits size of the magic number bytes. The 16 and 32 bit magic numbers are complete bytes and | ||
// therefore do not need any padding. | ||
magic48 := binary.BigEndian.Uint64(append(make([]byte, 2), input[0:6]...)) | ||
|
||
switch true { | ||
case magic16 == gzipMagic: | ||
return gzipDecompressor{} | ||
case magic16 == bzipMagic: | ||
return bzip2Decompressor{} | ||
case magic16 == lzwMagic: | ||
compressionByte := input[2] | ||
// litWidth is guaranteed to be at least 9 per specification, the high order 3 bits of byte[2] are the litWidth | ||
// the low order 5 bits are only used by non-unix implementations, we are going to ignore them. | ||
litWidth := int(compressionByte>>5) + 9 | ||
return lzwDecompressor{ | ||
order: lzw.MSB, | ||
litWidth: litWidth, | ||
} | ||
case magic32 == lz4Magic: | ||
return lz4Decompressor{} | ||
case magic48 == xzMagic: | ||
return xzDecompressor{} | ||
default: | ||
return noOpDecompressor{} | ||
} | ||
} | ||
|
||
// gzipDecompressor handles gzip-compressed streams.
type gzipDecompressor struct{}

// decompress wraps r in a gzip reader. If the gzip header cannot be read, the
// error is returned together with a nil io.Reader.
func (d gzipDecompressor) decompress(r io.Reader) (io.Reader, error) {
	zr, err := gzip.NewReader(r)
	if err != nil {
		// Return a literal nil: passing the nil *gzip.Reader through would
		// produce a non-nil interface value wrapping a nil pointer.
		return nil, err
	}
	return zr, nil
}
|
||
// bzip2Decompressor handles bzip2-compressed streams.
type bzip2Decompressor struct{}

// decompress wraps r in a bzip2 reader. Constructing the reader cannot fail,
// so the returned error is always nil.
func (bzip2Decompressor) decompress(r io.Reader) (io.Reader, error) {
	decoded := bzip2.NewReader(r)
	return decoded, nil
}
|
||
type xzDecompressor struct{} | ||
|
||
func (d xzDecompressor) decompress(r io.Reader) (io.Reader, error) { | ||
return xz.NewReader(r) | ||
} | ||
|
||
// lzwDecompressor handles LZW-compressed streams using the configured
// literal width and bit order.
type lzwDecompressor struct {
	// litWidth is the literal code width handed to lzw.NewReader.
	litWidth int
	// order is the bit ordering handed to lzw.NewReader.
	order lzw.Order
}

// decompress wraps r in an LZW reader built from d's settings. The returned
// error is always nil; any problem surfaces when the reader is read.
func (d lzwDecompressor) decompress(r io.Reader) (io.Reader, error) {
	decoded := lzw.NewReader(r, d.order, d.litWidth)
	return decoded, nil
}
|
||
type lz4Decompressor struct{} | ||
|
||
func (d lz4Decompressor) decompress(r io.Reader) (io.Reader, error) { | ||
return lz4.NewReader(r), nil | ||
} | ||
|
||
// noOpDecompressor is used for streams that are not recognised as compressed.
type noOpDecompressor struct{}

// decompress hands r back untouched; the returned error is always nil.
func (noOpDecompressor) decompress(r io.Reader) (io.Reader, error) {
	var passthrough io.Reader = r
	return passthrough, nil
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package extract | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestDetectFormat(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
input []byte | ||
expectType string | ||
}{ | ||
{ | ||
name: "GZIP", | ||
input: []byte{0x1f, 0x8b}, | ||
expectType: "extract.gzipDecompressor", | ||
}, | ||
{ | ||
name: "BZIP2", | ||
input: []byte{0x42, 0x5a}, | ||
expectType: "extract.bzip2Decompressor", | ||
}, | ||
{ | ||
name: "XZ", | ||
input: []byte{0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00}, | ||
expectType: "extract.xzDecompressor", | ||
}, | ||
{ | ||
name: "Less than 2 bytes", | ||
input: []byte{0x1f}, | ||
expectType: "extract.noOpDecompressor", | ||
}, | ||
{ | ||
name: "UNKNOWN", | ||
input: []byte{0xde, 0xad}, | ||
expectType: "extract.noOpDecompressor", | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
result := detectFormat(tt.input) | ||
assert.Equal(t, tt.expectType, stringFromInterface(result)) | ||
}) | ||
} | ||
} | ||
|
||
// stringFromInterface returns the dynamic type name of i as formatted by the
// %T verb, or the empty string when i is a nil interface.
func stringFromInterface(i interface{}) string {
	if i != nil {
		return fmt.Sprintf("%T", i)
	}
	return ""
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package extract | ||
|
||
import ( | ||
"bytes" | ||
"errors" | ||
"io" | ||
) | ||
|
||
// readPeeker is an io.Reader that can also expose upcoming bytes without
// consuming them.
type readPeeker interface {
	io.Reader
	Peek(int) ([]byte, error)
}

// Compile-time interface checks.
var (
	_ io.Reader  = (*peekReader)(nil)
	_ readPeeker = (*peekReader)(nil)
)

// peekReader adds Peek to an arbitrary io.Reader. Bytes fetched by a peek are
// held in buffer and served by Read before the underlying reader is consulted
// again.
type peekReader struct {
	reader io.Reader
	buffer *bytes.Buffer
}

// Read drains any bytes buffered by an earlier peek first, then falls through
// to the underlying reader. A single call never mixes the two sources.
func (p *peekReader) Read(b []byte) (int, error) {
	if p.buffer != nil && p.buffer.Len() > 0 {
		// A non-empty bytes.Buffer never reports io.EOF, so no EOF
		// translation is needed here.
		return p.buffer.Read(b)
	}
	return p.reader.Read(b)
}

// Peek returns the next n bytes without consuming them. If the underlying
// reader can peek natively, the call is delegated to it; otherwise the bytes
// are staged in an internal buffer. When fewer than n bytes are available,
// the available bytes are returned together with the error that stopped the
// read (io.EOF at end of input).
func (p *peekReader) Peek(n int) ([]byte, error) {
	if r, ok := p.reader.(readPeeker); ok {
		return r.Peek(n)
	}
	return p.peek(n)
}

// peek tops the internal buffer up to n bytes and returns its first n bytes.
// Only the missing bytes are fetched, so repeated peeks neither consume extra
// input nor return more than was asked for.
func (p *peekReader) peek(n int) ([]byte, error) {
	if n < 0 {
		// Guard against a panic from make with a negative capacity.
		return nil, errors.New("peekReader: negative peek count")
	}
	if p.buffer == nil {
		p.buffer = bytes.NewBuffer(make([]byte, 0, n))
	}
	if missing := n - p.buffer.Len(); missing > 0 {
		if _, err := io.CopyN(p.buffer, p.reader, int64(missing)); err != nil {
			// Short read: hand back whatever is buffered along with the error.
			return p.buffer.Bytes(), err
		}
	}
	return p.buffer.Bytes()[:n], nil
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package extract | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestPeekReader_Read(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
readerContents string | ||
wantBytesPeek int | ||
wantBytesRead int | ||
wantErr bool | ||
}{ | ||
{ | ||
name: "read from buffer only", | ||
readerContents: "abc123", | ||
wantBytesPeek: 6, | ||
wantBytesRead: 6, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "read from reader only", | ||
readerContents: "abc123", | ||
wantBytesRead: 3, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "read from both buffer and reader", | ||
readerContents: "abc123", | ||
wantBytesPeek: 3, | ||
wantBytesRead: 6, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "read empty reader and buffer", | ||
readerContents: "", | ||
wantBytesRead: 0, | ||
wantErr: true, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
reader := strings.NewReader(tt.readerContents) | ||
p := &peekReader{reader: reader} | ||
if tt.wantBytesPeek > 0 { | ||
peekBytes, err := p.Peek(tt.wantBytesPeek) | ||
assert.NoError(t, err) | ||
assert.Equal(t, tt.readerContents[0:tt.wantBytesPeek], string(peekBytes)) | ||
} | ||
|
||
var totalBytesRead int | ||
var err error | ||
readBytes := make([]byte, tt.wantBytesRead) | ||
for totalBytesRead < tt.wantBytesRead && err == nil { | ||
bytesRead, err := p.Read(readBytes[totalBytesRead:]) | ||
assert.NoError(t, err) | ||
totalBytesRead += bytesRead | ||
} | ||
assert.Equal(t, tt.wantBytesRead, totalBytesRead) | ||
assert.Equal(t, tt.readerContents[0:tt.wantBytesRead], string(readBytes)) | ||
|
||
if tt.wantErr { | ||
assert.Error(t, err) | ||
} else { | ||
assert.NoError(t, err) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters