/*
Copyright 2011 The Perkeep Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package server

import (
	"archive/zip"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
	"unicode/utf8"

	"go4.org/readerutil"
	"perkeep.org/internal/httputil"
	"perkeep.org/internal/magic"
	"perkeep.org/pkg/blob"
	"perkeep.org/pkg/blobserver"
	"perkeep.org/pkg/cacher"
	"perkeep.org/pkg/schema"
	"perkeep.org/pkg/search"
)

const (
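	// oneYear is the lifetime used for the Expires header on file
	// downloads: file blobs are immutable, so any cached copy stays valid.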
	oneYear            = 365 * 86400 * time.Second
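	// downloadTimeLayout, in Go reference-time notation, is used to
	// timestamp the names of generated zip archives (see serveZip).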
	downloadTimeLayout = "20060102150405"
)

var (
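	// debugPack enables extra logging of the blobpacked fast path when
	// the CAMLI_DEBUG_X environment variable contains "packserve".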
	debugPack = strings.Contains(os.Getenv("CAMLI_DEBUG_X"), "packserve")

	// Download URL suffix:
	//  $1: blobref (checked in download handler)
	//  $2: TODO. optional "/filename" to be sent as recommended download name,
	//    if sane looking
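	// For example, a request suffix such as "download/<blobref>/photo.jpg"
	// (with a real blobref and a hypothetical file name) yields
	// $1 = "<blobref>" and $2 = "/photo.jpg".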
	downloadPattern = regexp.MustCompile(`^download/([^/]+)(/.*)?$`)
)

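// DownloadHandler serves files, and zip archives of files, from its Fetcher,
// as described by their Perkeep file schema blobrefs.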
type DownloadHandler struct {
	Fetcher blob.Fetcher

	// Search is optional. If present, it's used to map a fileref
	// to a wholeref, if the Fetcher is of a type that knows how
	// to get at a wholeref more efficiently. (e.g. blobpacked)
	Search *search.Handler

	ForceMIME   string // optional
	forceInline bool   // to force Content-Disposition to inline, when it was not set in the request

	// pathByRef maps a file Ref to the path of the file, relative to its ancestor
	// directory which was requested for download. It is populated by checkFiles, which
	// only runs if Fetcher is a caching Fetcher.
	pathByRef map[blob.Ref]string

	// r is the incoming http request. It is stored in the DownloadHandler so we
	// don't have to clutter all the func signatures to pass it all the way down to
	// fileInfoPacked.
	r *http.Request
}

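// fileInfo describes one file (or directory) to be served over HTTP or
// written into a zip archive.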
type fileInfo struct {
	mime     string
	name     string
	size     int64
	modtime  time.Time
	mode     os.FileMode
	rs       io.ReadSeeker
	close    func() error // release the rs
	whyNot   string       // for testing, why fileInfoPacked failed.
	isDir    bool
	children []blob.Ref // directory entries, if we're a dir.
}

var errNotDir = errors.New("not a directory")

// dirInfo checks whether dir is a directory schema, and if so returns the
// corresponding fileInfo. If dir is another kind of (valid) file schema,
// errNotDir is returned.
func (dh *DownloadHandler) dirInfo(ctx context.Context, dir blob.Ref) (fi fileInfo, err error) {
	rc, _, err := dh.Fetcher.Fetch(ctx, dir)
	if err != nil {
		return fi, fmt.Errorf("could not fetch %v: %v", dir, err)
	}
	b, err := schema.BlobFromReader(dir, rc)
	rc.Close()
	if err != nil {
		return fi, fmt.Errorf("could not read %v as blob: %v", dir, err)
	}
	tp := b.Type()
	if tp != schema.TypeDirectory {
		return fi, errNotDir
	}
	dr, err := schema.NewDirReader(ctx, dh.Fetcher, dir)
	if err != nil {
		return fi, fmt.Errorf("could not open %v as directory: %v", dir, err)
	}
	children, err := dr.StaticSet(ctx)
	if err != nil {
		return fi, fmt.Errorf("could not get dir entries of %v: %v", dir, err)
	}
	return fileInfo{
		isDir:    true,
		name:     b.FileName(),
		modtime:  b.ModTime(),
		children: children,
	}, nil
}

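// fileInfo returns the metadata and a ReadSeeker for the file schema blob
// file. packed reports whether the contents are being served via the
// blobpacked fast path (see fileInfoPacked).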
func (dh *DownloadHandler) fileInfo(ctx context.Context, file blob.Ref) (fi fileInfo, packed bool, err error) {
	// Need to get the type first, because we can't use NewFileReader on a non-regular file.
	// TODO(mpl): should we let NewFileReader be ok with non-regular files? and fail later when e.g. trying to read?
	rc, _, err := dh.Fetcher.Fetch(ctx, file)
	if err != nil {
		return fi, false, fmt.Errorf("could not fetch %v: %v", file, err)
	}
	b, err := schema.BlobFromReader(file, rc)
	rc.Close()
	if err != nil {
		return fi, false, fmt.Errorf("could not read %v as blob: %v", file, err)
	}
	tp := b.Type()
	if tp != schema.TypeFile {
		// for non-regular files
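		// For a symlink, the served content is the link's target path; for
		// other non-regular types (e.g. FIFOs, sockets) the content is empty.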
		var contents string
		if tp == schema.TypeSymlink {
			sf, _ := b.AsStaticFile()
			sl, _ := sf.AsStaticSymlink()
			contents = sl.SymlinkTargetString()
		}
		size := int64(len(contents))
		// TODO(mpl): make sure that works on windows too
		rd := strings.NewReader(contents)
		fi = fileInfo{
			size:    size,
			modtime: b.ModTime(),
			name:    b.FileName(),
			mode:    b.FileMode(),
			rs:      readerutil.NewFakeSeeker(rd, size),
			close:   io.NopCloser(rd).Close,
		}
		return fi, false, nil
	}

	// Fast path for blobpacked.
	fi, ok := fileInfoPacked(ctx, dh.Search, dh.Fetcher, dh.r, file)
	if debugPack {
		log.Printf("download.go: fileInfoPacked: ok=%v, %+v", ok, fi)
	}
	if ok {
		return fi, true, nil
	}

	fr, err := schema.NewFileReader(ctx, dh.Fetcher, file)
	if err != nil {
		return
	}
	mime := dh.ForceMIME
	if mime == "" {
		mime = magic.MIMETypeFromReaderAt(fr)
	}
	if mime == "" {
		mime = "application/octet-stream"
	}
	return fileInfo{
		mime:    mime,
		name:    fr.FileName(),
		size:    fr.Size(),
		modtime: fr.ModTime(),
		mode:    fr.FileMode(),
		rs:      fr,
		close:   fr.Close,
	}, false, nil
}

// Fast path for blobpacked.
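// It asks the search handler for the file's wholeref and, if the Fetcher can
// open that wholeref directly (as blobpacked can), serves the file bytes from
// it instead of reassembling them chunk by chunk. ok reports whether the fast
// path was used; on failure, whyNot records the reason, for tests.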
func fileInfoPacked(ctx context.Context, sh *search.Handler, src blob.Fetcher, r *http.Request, file blob.Ref) (packFileInfo fileInfo, ok bool) {
	if sh == nil {
		return fileInfo{whyNot: "no search"}, false
	}
	wf, ok := src.(blobserver.WholeRefFetcher)
	if !ok {
		return fileInfo{whyNot: "fetcher type"}, false
	}
	if r != nil && r.Header.Get("Range") != "" {
		// TODO: not handled yet. Maybe not even important,
		// considering rarity.
		return fileInfo{whyNot: "range header"}, false
	}
	des, err := sh.Describe(ctx, &search.DescribeRequest{BlobRef: file})
	if err != nil {
		log.Printf("ui: fileInfoPacked: skipping fast path due to error from search: %v", err)
		return fileInfo{whyNot: "search error"}, false
	}
	db, ok := des.Meta[file.String()]
	if !ok || db.File == nil {
		return fileInfo{whyNot: "search index doesn't know file"}, false
	}
	fi := db.File
	if !fi.WholeRef.Valid() {
		return fileInfo{whyNot: "no wholeref from search index"}, false
	}

	offset := int64(0)
	rc, wholeSize, err := wf.OpenWholeRef(fi.WholeRef, offset)
	if err == os.ErrNotExist {
		return fileInfo{whyNot: "WholeRefFetcher returned ErrNotExist"}, false
	}
	if err != nil {
		log.Printf("ui: fileInfoPacked: skipping fast path due to error from WholeRefFetcher (%T): %v", src, err)
		return fileInfo{whyNot: "WholeRefFetcher error"}, false
	}
	if wholeSize != fi.Size {
		log.Printf("ui: fileInfoPacked: OpenWholeRef size %d != index size %d; ignoring fast path", wholeSize, fi.Size)
		return fileInfo{whyNot: "WholeRefFetcher and index don't agree"}, false
	}

	var modtime time.Time
	if !fi.ModTime.IsAnyZero() {
		modtime = fi.ModTime.Time()
	} else if !fi.Time.IsAnyZero() {
		modtime = fi.Time.Time()
	}
	// TODO(mpl): it'd be nicer to get the FileMode from the describe response,
	// instead of having to fetch the file schema again, but we don't index the
	// FileMode for now, so it's not just a matter of adding the FileMode to
	// camtypes.FileInfo
	fr, err := schema.NewFileReader(ctx, src, file)
	if err != nil {
		return fileInfo{whyNot: fmt.Sprintf("cannot open a file reader: %v", err)}, false
	}
	fr.Close()
	return fileInfo{
		mime:    fi.MIMEType,
		name:    fi.FileName,
		size:    fi.Size,
		modtime: modtime,
		mode:    fr.FileMode(),
		rs:      readerutil.NewFakeSeeker(rc, fi.Size-offset),
		close:   rc.Close,
	}, true
}

// ServeHTTP answers the following queries:
//
// POST:
//
//	?files=sha1-foo,sha1-bar,sha1-baz
//
// Creates a zip archive of the provided files and serves it in the response.
//
// GET:
//
//	/<file-schema-blobref>[?inline=1]
//
// Serves the file described by the requested file schema blobref.
// If inline=1, the Content-Disposition of the response is set to inline;
// otherwise it is set to attachment.
func (dh *DownloadHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	if r.Method == "POST" {
		dh.serveZip(w, r)
		return
	}

	suffix := httputil.PathSuffix(r)
	m := downloadPattern.FindStringSubmatch(suffix)
	if m == nil {
		httputil.ErrorRouting(w, r)
		return
	}
	file, ok := blob.Parse(m[1])
	if !ok {
		http.Error(w, "Invalid blobref", http.StatusBadRequest)
		return
	}
	// TODO(mpl): make use of m[2] (the optional filename).
	dh.ServeFile(w, r, file)
}

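// ServeFile serves the file described by the file schema blobref file over
// HTTP, for GET and HEAD requests.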
func (dh *DownloadHandler) ServeFile(w http.ResponseWriter, r *http.Request, file blob.Ref) {
	ctx := r.Context()
	if r.Method != "GET" && r.Method != "HEAD" {
		http.Error(w, "Invalid download method", http.StatusBadRequest)
		return
	}

	if r.Header.Get("If-Modified-Since") != "" {
		// Immutable, so any copy's a good copy.
		w.WriteHeader(http.StatusNotModified)
		return
	}

	dh.r = r
	fi, packed, err := dh.fileInfo(ctx, file)
	if err != nil {
		http.Error(w, "Can't serve file: "+err.Error(), http.StatusInternalServerError)
		return
	}
	if !fi.mode.IsRegular() {
		http.Error(w, "Not a regular file", http.StatusBadRequest)
		return
	}
	defer fi.close()

	h := w.Header()
	h.Set("Content-Length", fmt.Sprint(fi.size))
	h.Set("Expires", time.Now().Add(oneYear).Format(http.TimeFormat))
	if packed {
		h.Set("X-Camlistore-Packed", "1")
	}

	fileName := func(ext string) string {
		if fi.name != "" {
			return fi.name
		}
		return "file-" + file.String() + ext
	}

	if r.FormValue("inline") == "1" || dh.forceInline {
		// TODO(mpl): investigate why at least text files have an incorrect MIME.
		if fi.mime == "application/octet-stream" {
			// Since e.g. plain text files are detected as "application/octet-stream",
			// we explicitly check for that, so the browser can display them as text
			// if they are indeed actually text.
			text, err := isText(fi.rs)
			if err != nil {
				// TODO: https://perkeep.org/issues/1060
				httputil.ServeError(w, r, fmt.Errorf("cannot verify MIME type of file: %v", err))
				return
			}
			if text {
				fi.mime = "text/plain"
			}
		}
		h.Set("Content-Disposition", "inline")
	} else {
		h.Set("Content-Disposition", "attachment; filename="+fileName(".dat"))
	}
	h.Set("Content-Type", fi.mime)

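	// A HEAD request with ?verifycontents=<blobref> hashes the file's contents
	// on the server and, if the hash matches that blobref, advertises it in the
	// X-Camli-Contents response header, so clients can verify content without
	// downloading it.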
	if r.Method == "HEAD" && r.FormValue("verifycontents") != "" {
		vbr, ok := blob.Parse(r.FormValue("verifycontents"))
		if !ok {
			return
		}
		hash := vbr.Hash()
		if hash == nil {
			return
		}
		io.Copy(hash, fi.rs) // ignore errors, caught later
		if vbr.HashMatches(hash) {
			w.Header().Set("X-Camli-Contents", vbr.String())
		}
		return
	}

	http.ServeContent(w, r, "", time.Now(), fi.rs)
}

// isText reports whether the first MB read from rs is valid UTF-8 text.
func isText(rs io.ReadSeeker) (ok bool, err error) {
	defer func() {
		if _, seekErr := rs.Seek(0, io.SeekStart); seekErr != nil {
			if err == nil {
				err = seekErr
			}
		}
	}()
	var buf bytes.Buffer
	if _, err := io.CopyN(&buf, rs, 1e6); err != nil {
		if err != io.EOF {
			return false, err
		}
	}
	return utf8.Valid(buf.Bytes()), nil
}

// statFiles stats the given refs and returns an error if any one of them is not
// found.
// It is the responsibility of the caller to check that dh.Fetcher is a
// blobserver.BlobStatter.
func (dh *DownloadHandler) statFiles(refs []blob.Ref) error {
	statter, ok := dh.Fetcher.(blobserver.BlobStatter)
	if !ok {
		return fmt.Errorf("DownloadHandler.Fetcher %T is not a BlobStatter", dh.Fetcher)
	}
	statted := make(map[blob.Ref]bool)

	err := statter.StatBlobs(context.TODO(), refs, func(sb blob.SizedRef) error {
		statted[sb.Ref] = true
		return nil
	})
	if err != nil {
		log.Printf("Error statting blob files for download archive: %v", err)
		return fmt.Errorf("error looking for files")
	}
	for _, v := range refs {
		if _, ok := statted[v]; !ok {
			return fmt.Errorf("%q was not found", v)
		}
	}
	return nil
}

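// allowedFileTypes lists the non-directory schema types that checkFiles
// accepts for inclusion in a zip archive.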
var allowedFileTypes = map[schema.CamliType]bool{
	schema.TypeFile:    true,
	schema.TypeSymlink: true,
	schema.TypeFIFO:    true,
	schema.TypeSocket:  true,
}

// checkFiles reads, and discards, the file contents for each of the given file refs.
// It is used to check that all files requested for download are readable before
// starting to reply and/or creating a zip archive of them. It recursively
// checks directories as well. It also populates dh.pathByRef.
func (dh *DownloadHandler) checkFiles(ctx context.Context, parentPath string, fileRefs []blob.Ref) error {
	// TODO(mpl): add some concurrency
	for _, br := range fileRefs {
		rc, _, err := dh.Fetcher.Fetch(ctx, br)
		if err != nil {
			return fmt.Errorf("could not fetch %v: %v", br, err)
		}
		b, err := schema.BlobFromReader(br, rc)
		rc.Close()
		if err != nil {
			return fmt.Errorf("could not read %v as blob: %v", br, err)
		}
		tp := b.Type()
		if _, ok := allowedFileTypes[tp]; !ok && tp != schema.TypeDirectory {
			return fmt.Errorf("%v not a supported file or directory type: %q", br, tp)
		}
		if tp == schema.TypeDirectory {
			dr, err := b.NewDirReader(ctx, dh.Fetcher)
			if err != nil {
				return fmt.Errorf("could not open %v as directory: %v", br, err)
			}
			children, err := dr.StaticSet(ctx)
			if err != nil {
				return fmt.Errorf("could not get dir entries of %v: %v", br, err)
			}
			if err := dh.checkFiles(ctx, filepath.Join(parentPath, b.FileName()), children); err != nil {
				return err
			}
			continue
		}
		if tp != schema.TypeFile {
			// We only bother checking regular files. Symlinks, fifos, and sockets are
			// assumed ok.
			dh.pathByRef[br] = filepath.Join(parentPath, b.FileName())
			continue
		}
		fr, err := b.NewFileReader(dh.Fetcher)
		if err != nil {
			return fmt.Errorf("could not open %v: %v", br, err)
		}
		_, err = io.Copy(io.Discard, fr)
		fr.Close()
		if err != nil {
			return fmt.Errorf("could not read %v: %v", br, err)
		}
		dh.pathByRef[br] = filepath.Join(parentPath, b.FileName())
	}
	return nil
}

// serveZip creates a zip archive from the files provided as
// ?files=sha1-foo,sha1-bar,... and serves it as the response.
func (dh *DownloadHandler) serveZip(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	if r.Method != "POST" {
		http.Error(w, "Invalid download method", http.StatusBadRequest)
		return
	}

	filesValue := r.FormValue("files")
	if filesValue == "" {
		http.Error(w, "No file blobRefs specified", http.StatusBadRequest)
		return
	}
	files := strings.Split(filesValue, ",")

	var refs []blob.Ref
	for _, file := range files {
		br, ok := blob.Parse(file)
		if !ok {
			http.Error(w, fmt.Sprintf("%q is not a valid blobRef", file), http.StatusBadRequest)
			return
		}
		refs = append(refs, br)
	}

	// We check as many things as we can before writing the zip, because
	// once we start sending a response we can't http.Error anymore.
	var allRefs map[blob.Ref]string
	_, ok := (dh.Fetcher).(*cacher.CachingFetcher)
	if ok {
		// If we have a caching fetcher, allRefs and dh.pathByRef are populated with all
		// the input refs plus their children, so we don't have to redo later the recursive
		// work that we're already doing in checkFiles.
		dh.pathByRef = make(map[blob.Ref]string, len(refs))
		err := dh.checkFiles(ctx, "", refs)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}
		allRefs = dh.pathByRef
	} else {
		_, ok := dh.Fetcher.(blobserver.BlobStatter)
		if ok {
			if err := dh.statFiles(refs); err != nil {
				http.Error(w, err.Error(), http.StatusInternalServerError)
				return
			}
		}
		// If we don't have a cacher we don't know yet of all the possible
		// children refs, so allRefs is just the input refs, and the
		// children will be discovered on the fly, while building the zip archive.
		// This is the case even if we have a statter, because statFiles does not
		// recurse into directories.
		allRefs = make(map[blob.Ref]string, len(refs))
		for _, v := range refs {
			allRefs[v] = ""
		}
	}

	h := w.Header()
	h.Set("Content-Type", "application/zip")
	zipName := "camli-download-" + time.Now().Format(downloadTimeLayout) + ".zip"
	h.Set("Content-Disposition", "attachment; filename="+zipName)
	zw := zip.NewWriter(w)
	dh.r = r
	for br := range allRefs {
		if err := dh.zipFile(ctx, "", br, zw); err != nil {
			log.Printf("error zipping %v: %v", br, err)
			// http.Error is of no use since we've already started sending a response
			panic(http.ErrAbortHandler)
		}
	}
	if err := zw.Close(); err != nil {
		log.Printf("error closing zip stream: %v", err)
		panic(http.ErrAbortHandler)
	}
}

// zipFile, if br is a file, adds br to the zip archive that zw writes to. If br
// is a directory, zipFile adds all of its file descendants to the zip. parentPath
// is the path to the parent directory of br. It is only used if dh.pathByRef has
// not been populated (i.e. if dh does not use a caching fetcher).
func (dh *DownloadHandler) zipFile(ctx context.Context, parentPath string, br blob.Ref, zw *zip.Writer) error {
	if len(dh.pathByRef) == 0 {
		// if dh.pathByRef is not populated, we have to check for ourselves now whether
		// br is a directory.
		di, err := dh.dirInfo(ctx, br)
		if err != nil && err != errNotDir {
			return err
		}
		if di.isDir {
			for _, v := range di.children {
				if err := dh.zipFile(ctx, filepath.Join(parentPath, di.name), v, zw); err != nil {
					return err
				}
			}
			return nil
		}
	}
	fi, _, err := dh.fileInfo(ctx, br)
	if err != nil {
		return err
	}
	defer fi.close()
	filename, ok := dh.pathByRef[br]
	if !ok {
		// because we're in the len(dh.pathByRef) == 0 case.
		filename = filepath.Join(parentPath, fi.name)
	}
	zh := &zip.FileHeader{
		Name:   filename,
		Method: zip.Store,
	}
	zh.SetModTime(fi.modtime)
	zh.SetMode(fi.mode)
	zfh, err := zw.CreateHeader(zh)
	if err != nil {
		return err
	}
	_, err = io.Copy(zfh, fi.rs)
	if err != nil {
		return err
	}
	return nil
}