Skip to content

Commit

Permalink
Add flag for request timeout and skip TLS verification
Browse files Browse the repository at this point in the history
  • Loading branch information
RadhiFadlillah committed Apr 16, 2020
1 parent c7ed4a3 commit 536b0e3
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 58 deletions.
67 changes: 54 additions & 13 deletions archiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ package obelisk

import (
"context"
"crypto/tls"
"fmt"
"io"
"io/ioutil"
"net/http"
nurl "net/url"
"strings"
"sync"
"time"

"golang.org/x/sync/semaphore"
)
Expand All @@ -23,13 +25,17 @@ var (

// Config is the configuration for the archival process. It is passed by value
// to Archive and stored on the archiver for the duration of one run.
type Config struct {
UserAgent string
EnableLog bool
EnableVerboseLog bool
DisableJS bool
DisableCSS bool
DisableEmbeds bool
DisableMedias bool
// UserAgent is sent as the User-Agent header on every download request.
UserAgent string
// EnableLog and EnableVerboseLog presumably toggle normal and detailed
// logging; their consumers are not visible here — TODO confirm.
EnableLog bool
EnableVerboseLog bool

// The Disable* switches select resource kinds to leave out of the archive.
// The CLI describes embeds as embedded elements (e.g. iframe) and medias as
// media elements (e.g. img, audio); JS and CSS presumably cover scripts and
// stylesheets — verify against the processing code.
DisableJS bool
DisableCSS bool
DisableEmbeds bool
DisableMedias bool

// RequestTimeout is used as the Timeout of the HTTP client built in Archive.
RequestTimeout time.Duration
// SkipTLSVerification is copied to tls.Config.InsecureSkipVerify, so setting
// it disables certificate verification for all downloads.
SkipTLSVerification bool
// MaxConcurrentDownload is the weight given to the download semaphore
// created in Archive.
MaxConcurrentDownload int64
}

Expand All @@ -50,9 +56,10 @@ type archiver struct {
contentTypes map[string]string
dlSemaphore *semaphore.Weighted

config Config
cookies []*http.Cookie
rootURL string
config Config
rootURL string
cookies []*http.Cookie
httpClient *http.Client
}

// Archive starts archival process for the specified request.
Expand Down Expand Up @@ -85,15 +92,26 @@ func Archive(ctx context.Context, req Request, cfg Config) ([]byte, string, erro
rootURL.Query().Del(key)
}

httpClient := &http.Client{
Timeout: cfg.RequestTimeout,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: cfg.SkipTLSVerification,
},
},
Jar: nil,
}

arc := &archiver{
ctx: ctx,
cache: make(map[string][]byte),
contentTypes: make(map[string]string),
dlSemaphore: semaphore.NewWeighted(cfg.MaxConcurrentDownload),

config: cfg,
cookies: req.Cookies,
rootURL: rootURL.String(),
config: cfg,
rootURL: rootURL.String(),
cookies: req.Cookies,
httpClient: httpClient,
}

// If needed download page from source URL
Expand Down Expand Up @@ -124,3 +142,26 @@ func Archive(ctx context.Context, req Request, cfg Config) ([]byte, string, erro

return []byte(result), contentType, nil
}

// downloadFile sends a GET request for url through the archiver's shared HTTP
// client and returns the raw response.
//
// The request is bound to arc.ctx, so cancelling the context supplied to
// Archive aborts the request instead of letting it run until the client
// timeout. Every request carries the configured User-Agent and all cookies
// held by the archiver; requests for anything other than the root URL also
// send the root URL as Referer.
//
// On success the caller owns the response and must close resp.Body. On
// failure the returned response is always nil.
func (arc *archiver) downloadFile(url string) (*http.Response, error) {
	req, err := http.NewRequestWithContext(arc.ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", arc.config.UserAgent)
	// The root page itself is requested without a Referer; its sub-resources
	// are requested as if they were loaded from the root page.
	if url != arc.rootURL {
		req.Header.Set("Referer", arc.rootURL)
	}

	// The client has no cookie jar, so cookies are attached explicitly.
	for _, cookie := range arc.cookies {
		req.AddCookie(cookie)
	}

	resp, err := arc.httpClient.Do(req)
	if err != nil {
		return nil, err
	}

	return resp, nil
}
10 changes: 9 additions & 1 deletion cmd/obelisk/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
nurl "net/url"
"os"
fp "path/filepath"
"time"

"github.com/go-shiori/obelisk"
"github.com/sirupsen/logrus"
Expand All @@ -25,6 +26,8 @@ func main() {

cmd.Flags().StringP("input", "i", "", "path to file which contains URLs")
cmd.Flags().StringP("output", "o", "", "path to save archival result")
cmd.Flags().StringP("load-cookies", "c", "", "path to Netscape cookie file")

cmd.Flags().StringP("user-agent", "u", "", "set custom user agent")
cmd.Flags().BoolP("gzip", "z", false, "gzip archival result")
cmd.Flags().BoolP("quiet", "q", false, "disable logging")
Expand All @@ -35,8 +38,9 @@ func main() {
cmd.Flags().Bool("no-embeds", false, "remove embedded elements (e.g iframe)")
cmd.Flags().Bool("no-medias", false, "remove media elements (e.g img, audio)")

cmd.Flags().IntP("timeout", "t", 60, "maximum time (in second) before request timeout")
cmd.Flags().Bool("insecure", false, "skip X.509 (TLS) certificate verification")
cmd.Flags().Int64("max-concurrent-download", 10, "max concurrent download at a time")
cmd.Flags().StringP("load-cookies", "c", "", "path to Netscape cookie file")

// Execute
err := cmd.Execute()
Expand All @@ -62,6 +66,8 @@ func cmdHandler(cmd *cobra.Command, args []string) error {
disableEmbeds, _ := cmd.Flags().GetBool("no-embeds")
disableMedias, _ := cmd.Flags().GetBool("no-medias")

timeout, _ := cmd.Flags().GetInt("timeout")
skipTLSVerification, _ := cmd.Flags().GetBool("insecure")
maxConcurrentDownload, _ := cmd.Flags().GetInt64("max-concurrent-download")

// Create list of URLs
Expand Down Expand Up @@ -114,6 +120,8 @@ func cmdHandler(cmd *cobra.Command, args []string) error {
DisableEmbeds: disableEmbeds,
DisableMedias: disableMedias,

RequestTimeout: time.Duration(timeout) * time.Second,
SkipTLSVerification: skipTLSVerification,
MaxConcurrentDownload: maxConcurrentDownload,
}

Expand Down
44 changes: 0 additions & 44 deletions http-client.go

This file was deleted.

0 comments on commit 536b0e3

Please sign in to comment.