-
Notifications
You must be signed in to change notification settings - Fork 20
/
process-url.go
105 lines (88 loc) · 2.48 KB
/
process-url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package obelisk
import (
"context"
"fmt"
"io"
nurl "net/url"
"strings"
"github.com/pkg/errors"
)
// errSkippedURL is the sentinel error returned by processURL when a URL is
// intentionally not processed: it is empty, a "data:" URL, a "#" fragment,
// not a valid request URL with scheme and host, or its download failed
// while Archiver.SkipResourceURLError is set.
var errSkippedURL = errors.New("skip processing url")
// processURL fetches the resource at url and returns its content together
// with the Content-Type value reported by the response.
//
// parentURL is passed through to logURL and downloadFile. The optional
// embedded flag (only its first value is read) marks the resource as an
// embedded document; an HTML response is run through processHTML only when
// that flag is true. CSS responses are always run through processCSS. Every
// other response body is returned as raw bytes.
//
// Successful results are stored in arc.Cache under the trimmed URL, and a
// cache hit is returned without downloading again.
//
// Errors:
//   - errSkippedURL when the URL is empty, a "data:" URL, a "#" fragment,
//     not a valid request URL with scheme and host, or when the download
//     fails while arc.SkipResourceURLError is set.
//   - a wrapped error when the download semaphore cannot be acquired or
//     when the download fails and arc.SkipResourceURLError is not set.
//   - the unmodified error from processHTML, processCSS or io.ReadAll.
//
//nolint:gocyclo,unparam
func (arc *Archiver) processURL(ctx context.Context, url string, parentURL string, embedded ...bool) ([]byte, string, error) {
	// Parse embedded value
	isEmbedded := len(embedded) != 0 && embedded[0]

	// Make sure this URL is not empty, data or hash. If yes, just skip it.
	url = strings.TrimSpace(url)
	if url == "" || strings.HasPrefix(url, "data:") || strings.HasPrefix(url, "#") {
		return nil, "", errSkippedURL
	}

	// Parse URL to make sure it's a valid request URL. If not, there might
	// have been some error while preparing the document, so just skip it.
	parsedURL, err := nurl.ParseRequestURI(url)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return nil, "", errSkippedURL
	}

	// Check the cache to see if this URL has already been processed.
	arc.RLock()
	cache, cacheExist := arc.Cache[url]
	arc.RUnlock()

	if cacheExist {
		arc.logURL(url, parentURL, true)
		return cache.Data, cache.ContentType, nil
	}

	// Download the resource, using the semaphore to limit concurrent downloads.
	arc.logURL(url, parentURL, false)

	// Report a failed Acquire (presumably a cancelled or expired context)
	// instead of returning a nil error, which callers would mistake for a
	// successful fetch with empty content.
	if err = arc.dlSemaphore.Acquire(ctx, 1); err != nil {
		return nil, "", fmt.Errorf("acquire download slot: %w", err)
	}

	resp, err := arc.downloadFile(url, parentURL)
	// The slot only guards the request itself; the body is read after release.
	arc.dlSemaphore.Release(1)
	if err != nil {
		if arc.SkipResourceURLError {
			return nil, "", errSkippedURL
		}
		return nil, "", fmt.Errorf("download failed: %w", err)
	}
	defer resp.Body.Close()

	// Get content type
	contentType := strings.TrimSpace(resp.Header.Get("Content-Type"))
	if contentType == "" {
		contentType = "text/plain"
	}

	// Dispatch on the bare media type: drop parameters such as
	// "; charset=utf-8" and ignore case, otherwise typical headers like
	// "text/css; charset=utf-8" would never be recognised. The original
	// header value is still what gets returned and cached.
	mediaType := contentType
	if idx := strings.IndexByte(mediaType, ';'); idx >= 0 {
		mediaType = mediaType[:idx]
	}
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))

	// Read content of response body. If the downloaded file is HTML
	// or CSS it needs to be processed again.
	var bodyContent []byte

	switch {
	case mediaType == "text/html" && isEmbedded:
		newHTML, err := arc.processHTML(ctx, resp.Body, parsedURL, false)
		if err != nil {
			return nil, "", err
		}
		bodyContent = s2b(newHTML)

	case mediaType == "text/css":
		newCSS, err := arc.processCSS(ctx, resp.Body, parsedURL)
		if err != nil {
			return nil, "", err
		}
		bodyContent = s2b(newCSS)

	default:
		bodyContent, err = io.ReadAll(resp.Body)
		if err != nil {
			return nil, "", err
		}
	}

	// Save the processed data to the cache.
	arc.Lock()
	arc.Cache[url] = Asset{
		Data:        bodyContent,
		ContentType: contentType,
	}
	arc.Unlock()

	return bodyContent, contentType, nil
}