Home Download Docs Code Community
     1	/*
     2	Copyright 2018 The Perkeep Authors
     3	
     4	Licensed under the Apache License, Version 2.0 (the "License");
     5	you may not use this file except in compliance with the License.
     6	You may obtain a copy of the License at
     7	
     8	     http://www.apache.org/licenses/LICENSE-2.0
     9	
    10	Unless required by applicable law or agreed to in writing, software
    11	distributed under the License is distributed on an "AS IS" BASIS,
    12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13	See the License for the specific language governing permissions and
    14	limitations under the License.
    15	*/
    16	
    17	// Package instapaper implements an instapaper.com importer.
    18	package instapaper // import "perkeep.org/pkg/importer/instapaper"
    19	
    20	import (
    21		"encoding/json"
    22		"errors"
    23		"fmt"
    24		"html/template"
    25		"log"
    26		"net/http"
    27		"net/url"
    28		"os"
    29		"sort"
    30		"strings"
    31		"sync"
    32		"time"
    33	
    34		"github.com/garyburd/go-oauth/oauth"
    35		"go4.org/ctxutil"
    36		"go4.org/syncutil"
    37	
    38		"perkeep.org/internal/httputil"
    39		"perkeep.org/pkg/importer"
    40		"perkeep.org/pkg/schema"
    41		"perkeep.org/pkg/schema/nodeattr"
    42		"perkeep.org/pkg/search"
    43	)
    44	
    45	func init() {
    46		importer.Register("instapaper", &imp{})
    47	}
    48	
// user holds the account fields decoded from the response to the
// verify_credentials request (see getUserInfo).
type user struct {
	UserId   int    `json:"user_id"`
	Username string `json:"username"`
}
    53	
// folder identifies one Instapaper folder whose bookmarks are listed.
//
// FolderId is a json.Number so that it can hold both the IDs decoded from
// the folders list response and the literal names ("unread", "starred",
// "archive") that getFolders appends.
type folder struct {
	Title    string
	FolderId json.Number `json:"folder_id"`
}
    58	
// bookmark is one element decoded from the bookmarks list response.
//
// Elements whose BookmarkId is zero are skipped by importBookmarks as
// non-bookmark objects. Time and ProgressTimestamp are passed to time.Unix
// by importBookmark, i.e. they are treated as seconds since the Unix epoch.
type bookmark struct {
	Hash              string  `json:"hash"`
	Description       string  `json:"description"`
	BookmarkId        int     `json:"bookmark_id"`
	PrivateSource     string  `json:"private_source"`
	Title             string  `json:"title"`
	Url               string  `json:"url"`
	ProgressTimestamp int     `json:"progress_timestamp"`
	Time              int     `json:"time"`
	Progress          float64 `json:"progress"`
	Starred           string  `json:"starred"`
}
    71	
// highlight is one element decoded from a bookmark's highlights list
// response.
//
// Time is passed to time.Unix by importHighlight, i.e. it is treated as
// seconds since the Unix epoch. Position is decoded but not stored on the
// imported permanode.
type highlight struct {
	HighlightId int    `json:"highlight_id"`
	Text        string `json:"text"`
	Note        string `json:"note"`
	BookmarkId  int    `json:"bookmark_id"`
	Time        int    `json:"time"`
	Position    int    `json:"position"`
}
    80	
const (
	// Import Types
	// These are the nodeattr.Type values set on imported permanodes.
	nodeTypeBookmark  = "instapaper.com:bookmark"
	nodeTypeHighlight = "instapaper.com:highlight"

	// Import Attributes
	// attrBookmarkId holds the Instapaper bookmark ID; it is set on both
	// bookmark and highlight permanodes and is what findExistingBookmark
	// searches on.
	attrBookmarkId = "instapaper.com:bookmarkId"
	// attrUrl holds the instapaper.com/read/<id> link for a bookmark. The
	// bookmark's own URL is stored separately under nodeattr.URL.
	attrUrl        = "instapaper.com:url"
	// Progress is the amount of the bookmark text Instapaper says you've already read.
	attrProgress = "instapaper.com:progress"
	// ProgressTimestamp is the date/time a user last read a portion or all of the bookmark's text.
	attrProgressTimestamp = "instapaper.com:progressTimestamp"

	requestLimit       = "500" // max number of bookmarks that Instapaper will return
	bookmarksAtOnce    = 20    // how many bookmarks to import at once
	// runCompleteVersion is written to the account's
	// importer.AcctAttrCompletedVersion attribute after a successful run;
	// when the stored value matches, the next run is incremental.
	runCompleteVersion = "1"

	// API URLs
	// All of these are requested with HTTP POST.
	tokenRequestURL         = "https://www.instapaper.com/api/1/oauth/access_token"
	verifyUserRequestURL    = "https://www.instapaper.com/api/1/account/verify_credentials"
	bookmarkListRequestURL  = "https://www.instapaper.com/api/1/bookmarks/list"
	bookmarkTextRequestURL  = "https://www.instapaper.com/api/1/bookmarks/get_text"
	foldersListRequestURL   = "https://www.instapaper.com/api/1.1/folders/list"
	// The %d verb is filled in with the bookmark ID.
	highlightListRequestURL = "https://www.instapaper.com/api/1.1/bookmarks/%d/highlights"
)
   106	
var (
	// logger writes to standard error, prefixing every line with
	// "instapaper.com: " and the standard date/time flags.
	logger = log.New(os.Stderr, "instapaper.com: ", log.LstdFlags)
)
   110	
// imp is the Instapaper importer that init registers with the importer
// package. It carries no state of its own.
type imp struct {
	importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters
}
   114	
   115	func (*imp) Properties() importer.Properties {
   116		return importer.Properties{
   117			Title:               "Instapaper",
   118			Description:         "Import full text bookmarks and highlights from an Instapaper account",
   119			NeedsAPIKey:         true,
   120			SupportsIncremental: true,
   121		}
   122	}
   123	
   124	func (*imp) IsAccountReady(acct *importer.Object) (ready bool, err error) {
   125		return acct.Attr(importer.AcctAttrAccessToken) != "" && acct.Attr(importer.AcctAttrUserID) != "", nil
   126	}
   127	
   128	func (*imp) SummarizeAccount(acct *importer.Object) string {
   129		userID := acct.Attr(importer.AcctAttrUserID)
   130		if userID == "" {
   131			return "Not configured"
   132		}
   133		return userID
   134	}
   135	
   136	func (imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
   137		return tmpl.ExecuteTemplate(w, "serveSetup", ctx)
   138	}
   139	
// tmpl holds the HTML templates served by this importer. It defines a
// single template, "serveSetup": a GET form that submits the account
// permanode reference ("acct"), a required "username" and an optional
// "password" to the setup context's CallbackURL. template.Must panics at
// package initialization if the template text fails to parse.
var tmpl = template.Must(template.New("root").Parse(`
{{define "serveSetup"}}
<h1>Configuring Instapaper Account</h1>
<h3>If your Instapaper account does not have a password, leave that field blank. However, a username is required. Passwords are not stored at all and are only used to retrieve an access token.</h3>
<form method="get" action="{{.CallbackURL}}">
  <input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}">
  <table border=0 cellpadding=3>
  <tr><td align=right>Username</td><td><input name="username" size=50 required></td></tr>
  <tr><td align=right>Password</td><td><input name="password" type="password" size=50></td></tr>
  <tr><td align=right></td><td><input type="submit" value="Add"></td></tr>
  </table>
</form>
{{end}}
`))
   154	
   155	var _ importer.ImporterSetupHTMLer = (*imp)(nil)
   156	
   157	func (im *imp) AccountSetupHTML(host *importer.Host) string {
   158		return "<h1>Configuring Instapaper</h1><p>To get an OAuth client ID and secret, <a target=\"_blank\" href=\"https://www.instapaper.com/main/request_oauth_consumer_token\">fill this out</a>. You should receive an email response from Instapaper with the Client ID and Client Secret that you should use in the form above.</p>"
   159	}
   160	
   161	func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
   162		username := r.FormValue("username")
   163		password := r.FormValue("password")
   164	
   165		// We have to assume password can be blank as Instapaper does not require a password
   166		if username == "" {
   167			httputil.BadRequestError(w, "Expected a username")
   168			return
   169		}
   170	
   171		clientID, secret, err := ctx.Credentials()
   172		if err != nil {
   173			httputil.ServeError(w, r, fmt.Errorf("Credentials error: %v", err))
   174			return
   175		}
   176	
   177		oauthClient := &oauth.Client{
   178			TokenRequestURI: tokenRequestURL,
   179			Credentials: oauth.Credentials{
   180				Token:  clientID,
   181				Secret: secret,
   182			},
   183		}
   184		creds, _, err := oauthClient.RequestTokenXAuth(ctxutil.Client(ctx), nil, username, password)
   185		if err != nil {
   186			httputil.ServeError(w, r, fmt.Errorf("Failed to get access token: %v", err))
   187			return
   188		}
   189	
   190		user, err := getUserInfo(importer.OAuthContext{Ctx: ctx.Context, Client: oauthClient, Creds: creds})
   191		if err != nil {
   192			httputil.ServeError(w, r, fmt.Errorf("Failed to verify credentials: %v", err))
   193			return
   194		}
   195	
   196		if err := ctx.AccountNode.SetAttrs(
   197			nodeattr.Title, fmt.Sprintf("Instapaper account: %s", user.Username),
   198			importer.AcctAttrAccessToken, creds.Token,
   199			importer.AcctAttrAccessTokenSecret, creds.Secret,
   200			importer.AcctAttrUserName, user.Username,
   201			importer.AcctAttrUserID, fmt.Sprint(user.UserId),
   202		); err != nil {
   203			httputil.ServeError(w, r, fmt.Errorf("Error setting attributes: %v", err))
   204			return
   205		}
   206		http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
   207	}
   208	
   209	func (im *imp) Run(ctx *importer.RunContext) (err error) {
   210		clientId, secret, err := ctx.Credentials()
   211		if err != nil {
   212			return fmt.Errorf("no API credentials: %v", err)
   213		}
   214		acctNode := ctx.AccountNode()
   215		accessToken := acctNode.Attr(importer.AcctAttrAccessToken)
   216		accessSecret := acctNode.Attr(importer.AcctAttrAccessTokenSecret)
   217		if accessToken == "" || accessSecret == "" {
   218			return errors.New("access credentials not found")
   219		}
   220		userID := acctNode.Attr(importer.AcctAttrUserID)
   221		if userID == "" {
   222			return errors.New("userID hasn't been set by account setup")
   223		}
   224		r := &run{
   225			RunContext:  ctx,
   226			im:          im,
   227			incremental: acctNode.Attr(importer.AcctAttrCompletedVersion) == runCompleteVersion,
   228			oauthClient: &oauth.Client{
   229				Credentials: oauth.Credentials{
   230					Token:  clientId,
   231					Secret: secret,
   232				},
   233			},
   234			accessCreds: &oauth.Credentials{
   235				Token:  accessToken,
   236				Secret: accessSecret,
   237			},
   238		}
   239		folders, err := r.getFolders()
   240		if err != nil {
   241			return err
   242		}
   243		if err := r.importBookmarks(userID, folders); err != nil {
   244			return err
   245		}
   246	
   247		return acctNode.SetAttrs(importer.AcctAttrCompletedVersion, runCompleteVersion)
   248	}
   249	
// run holds the state of a single import started by (*imp).Run.
type run struct {
	*importer.RunContext
	im          *imp
	// incremental is true when the account node already records a completed
	// run at runCompleteVersion; duplicates then cut work short.
	incremental bool

	// oauthClient carries the client (consumer) credentials; accessCreds
	// carries the per-account access token and secret.
	oauthClient *oauth.Client
	accessCreds *oauth.Credentials

	// mu guards txtReqs, which is appended to by concurrent bookmark imports
	// and then processed serially by importBookmarks.
	mu      sync.Mutex
	txtReqs []txtReq
}
   261	
   262	func getUserInfo(ctx importer.OAuthContext) (*user, error) {
   263		var ui []user
   264		if err := ctx.PopulateJSONFromURL(&ui, http.MethodPost, verifyUserRequestURL); err != nil {
   265			return nil, err
   266		}
   267		if ui[0].UserId == 0 {
   268			return nil, errors.New("no user returned")
   269		}
   270		return &ui[0], nil
   271	}
   272	
   273	func parseFilename(t string, id string) string {
   274		return fmt.Sprintf("%v_%v.html", strings.Replace(t, "/", "-", -1), id)
   275	}
   276	
   277	func (r *run) findExistingBookmark(bookmarkId string) (*importer.Object, error) {
   278		res, err := r.Host.Searcher().Query(r.Context(), &search.SearchQuery{
   279			Constraint: &search.Constraint{
   280				Permanode: &search.PermanodeConstraint{
   281					Attr:  attrBookmarkId,
   282					Value: bookmarkId,
   283				},
   284			},
   285			Describe: &search.DescribeRequest{
   286				Depth: 1,
   287			},
   288		})
   289		if err != nil {
   290			return nil, err
   291		}
   292		if res.Describe == nil {
   293			return nil, os.ErrNotExist
   294		}
   295		for _, resBlob := range res.Blobs {
   296			br := resBlob.Blob
   297			desBlob, ok := res.Describe.Meta[br.String()]
   298			if !ok || desBlob.Permanode == nil {
   299				continue
   300			}
   301			return r.Host.ObjectFromRef(br)
   302		}
   303		return nil, os.ErrNotExist
   304	}
   305	
   306	func (r *run) getFolders() ([]folder, error) {
   307		var folders []folder
   308		if err := r.doAPI(&folders, foldersListRequestURL); err != nil {
   309			return nil, err
   310		}
   311		return append(folders,
   312			folder{Title: "Unread", FolderId: "unread"},
   313			folder{Title: "Starred", FolderId: "starred"},
   314			folder{Title: "Archive", FolderId: "archive"},
   315		), nil
   316	}
   317	
// txtReq is a deferred request to fetch and store the full text of a
// bookmark. Requests are queued on run.txtReqs during the concurrent
// bookmark import and executed one at a time afterwards.
type txtReq struct {
	bmNode *importer.Object // permanode that receives the camliContent attribute
	bm     *bookmark        // bookmark whose text is fetched
}
   322	
   323	func (r *run) importBookmarks(userID string, folders []folder) error {
   324		bsParent, err := r.getTopLevelNode("bookmarks")
   325		if err != nil {
   326			return err
   327		}
   328		hsParent, err := r.getTopLevelNode("highlights")
   329		if err != nil {
   330			return err
   331		}
   332	
   333		var (
   334			gate = syncutil.NewGate(bookmarksAtOnce)
   335			grp  syncutil.Group
   336		)
   337	
   338		for fi := range folders {
   339			f := folders[fi]
   340			var bList []*bookmark
   341	
   342			err := r.doAPI(&bList, bookmarkListRequestURL, "limit", requestLimit, "folder_id", f.FolderId.String())
   343			if err != nil {
   344				return err
   345			}
   346	
   347			for bi := range bList {
   348				select {
   349				case <-r.Context().Done():
   350					logger.Printf("importer interrupted")
   351					return r.Context().Err()
   352				default:
   353				}
   354	
   355				b := bList[bi]
   356				if b.BookmarkId == 0 {
   357					continue // ignore non-bookmark objects included in the response
   358				}
   359	
   360				gate.Start()
   361				grp.Go(func() error {
   362					defer gate.Done()
   363					bNode, dup, err := r.importBookmark(bsParent, b, f.Title)
   364					if err != nil {
   365						logger.Printf("error importing bookmark %d %v", b.BookmarkId, err)
   366						return err
   367					}
   368					if !r.incremental || !dup {
   369						r.mu.Lock()
   370						r.txtReqs = append(r.txtReqs, txtReq{bmNode: bNode, bm: b})
   371						r.mu.Unlock()
   372					}
   373					return r.importHighlights(hsParent, bNode, b)
   374				})
   375			}
   376		}
   377	
   378		err = grp.Err()
   379		if err != nil {
   380			return err
   381		}
   382	
   383		// Process requests for bookmark text serially because
   384		// Instapaper's API TOS specify that /get_text requests must be performed in series.
   385		// All other API requests can happen in parallel.
   386		for _, req := range r.txtReqs {
   387			if err := r.importBookmarkText(req); err != nil {
   388				return err
   389			}
   390		}
   391		return nil
   392	}
   393	
   394	func (r *run) importBookmark(parent *importer.Object, b *bookmark, folder string) (*importer.Object, bool, error) {
   395		// Find an existing permanode by camliPath:{filename} on the parent node.
   396		// If one doesn't exist, try searching for any permanode that has a
   397		// matching instapaper.com:bookmarkId attribute in case the title, which
   398		// is mutable, was changed.
   399		bmNode, err := parent.ChildPathObjectOrFunc(parseFilename(b.Title, fmt.Sprint(b.BookmarkId)),
   400			func() (*importer.Object, error) {
   401				found, err := r.findExistingBookmark(fmt.Sprint(b.BookmarkId))
   402				if err != nil {
   403					if err != os.ErrNotExist {
   404						return nil, fmt.Errorf("searching for node with %v %v: %v", attrBookmarkId, b.BookmarkId, err)
   405					}
   406					return r.Host.NewObject()
   407				}
   408				// If an existing permanode was found by BookmarkId, that means the
   409				// bookmark's title was changed. So, delete the old camliPath which
   410				// was based on the old title so we don't have two camliPaths on
   411				// the parent pointing to the same permanode.
   412				oldTitle := parseFilename(found.Attr(nodeattr.Title), fmt.Sprint(b.BookmarkId))
   413				if err := parent.DelAttr(fmt.Sprintf("camliPath:%s", oldTitle), ""); err != nil {
   414					return nil, err
   415				}
   416				return found, nil
   417			})
   418		if err != nil {
   419			return nil, false, err
   420		}
   421	
   422		instapaperUrl := fmt.Sprintf("https://www.instapaper.com/read/%v", b.BookmarkId)
   423		attrs := []string{
   424			attrBookmarkId, fmt.Sprint(b.BookmarkId),
   425			nodeattr.Type, nodeTypeBookmark,
   426			nodeattr.DateCreated, schema.RFC3339FromTime(time.Unix(int64(b.Time), 0)),
   427			nodeattr.Title, b.Title,
   428			nodeattr.Description, b.Description,
   429			nodeattr.URL, b.Url,
   430			attrUrl, instapaperUrl,
   431			attrProgress, fmt.Sprint(b.Progress),
   432			attrProgressTimestamp, schema.RFC3339FromTime(time.Unix(int64(b.ProgressTimestamp), 0)),
   433			nodeattr.Starred, b.Starred,
   434			nodeattr.Folder, folder,
   435		}
   436	
   437		changes, err := bmNode.SetAttrs2(attrs...)
   438		if err == nil && changes {
   439			logger.Printf("imported bookmark %s", b.Url)
   440		}
   441		return bmNode, !changes, nil
   442	}
   443	
   444	func (r *run) importBookmarkText(req txtReq) error {
   445		filename := parseFilename(req.bm.Title, fmt.Sprint(req.bm.BookmarkId))
   446		form := url.Values{}
   447		form.Add("bookmark_id", fmt.Sprint(req.bm.BookmarkId))
   448		resp, err := importer.OAuthContext{
   449			Ctx:    r.Context(),
   450			Client: r.oauthClient,
   451			Creds:  r.accessCreds}.POST(bookmarkTextRequestURL, form)
   452		if err != nil {
   453			if resp != nil && resp.StatusCode == http.StatusBadRequest {
   454				// Ignore 400 Bad Request HTTP response codes for bookmark text given some bookmarks won't have full text available but we do not
   455				// know which ones until we make the /get_text request and the call fails with a 400 status.
   456				logger.Printf("no text available for %v: %v", req.bm.Url, err)
   457				return nil
   458			}
   459			return err
   460		}
   461		defer resp.Body.Close()
   462		fileRef, err := schema.WriteFileFromReader(r.Context(), r.Host.Target(), filename, resp.Body)
   463		if err != nil {
   464			return fmt.Errorf("error storing bookmark content: %v", err)
   465		}
   466		err = req.bmNode.SetAttr("camliContent", fileRef.String())
   467		if err == nil {
   468			logger.Printf("imported text for %s", req.bm.Url)
   469		}
   470		return err
   471	}
   472	
   473	func (r *run) importHighlights(parent *importer.Object, bNode *importer.Object, b *bookmark) error {
   474		var hList []*highlight
   475		err := r.doAPI(&hList, fmt.Sprintf(highlightListRequestURL, b.BookmarkId))
   476		if err != nil {
   477			return err
   478		}
   479	
   480		// Given Instapaper's API returns highlights sorted by Time in ASC, we need to sort by Time DESC to make the newest
   481		// highlights show up first so we can quit importing early on incremental runs.
   482		sort.Slice(hList, func(i, j int) bool {
   483			return hList[i].Time > hList[j].Time
   484		})
   485	
   486		for hi := range hList {
   487			h := hList[hi]
   488			dup, err := r.importHighlight(parent, bNode, h)
   489			if err != nil {
   490				logger.Printf("error importing highlight %d %v", h.HighlightId, err)
   491			}
   492			if dup && r.incremental {
   493				logger.Printf("incremental highlights import found end batch")
   494				break
   495			}
   496		}
   497		return nil
   498	}
   499	
   500	func (r *run) importHighlight(parent *importer.Object, bNode *importer.Object, h *highlight) (bool, error) {
   501		hNode, err := parent.ChildPathObject(fmt.Sprint(h.HighlightId))
   502		if err != nil {
   503			return false, err
   504		}
   505	
   506		attrs := []string{
   507			nodeattr.Type, nodeTypeHighlight,
   508			nodeattr.DateCreated, schema.RFC3339FromTime(time.Unix(int64(h.Time), 0)),
   509			nodeattr.Title, bNode.Attr(nodeattr.Title),
   510			nodeattr.Content, h.Text,
   511			nodeattr.Description, h.Note,
   512			attrBookmarkId, fmt.Sprint(h.BookmarkId),
   513		}
   514	
   515		changes, err := hNode.SetAttrs2(attrs...)
   516		return !changes, err
   517	}
   518	
   519	func (r *run) getTopLevelNode(path string) (*importer.Object, error) {
   520		acctNode := r.AccountNode()
   521		root := r.RootNode()
   522		username := acctNode.Attr(importer.AcctAttrUserName)
   523		rootTitle := fmt.Sprintf("Instapaper Data for %s", username)
   524		if err := root.SetAttrs(nodeattr.Title, rootTitle, "camliImportRoot", "instapaper-"+username); err != nil {
   525			return nil, err
   526		}
   527	
   528		obj, err := root.ChildPathObject(path)
   529		if err != nil {
   530			return nil, err
   531		}
   532	
   533		var title string
   534		switch path {
   535		case "bookmarks":
   536			title = fmt.Sprintf("Bookmarks for %s", acctNode.Attr(importer.AcctAttrUserName))
   537		case "highlights":
   538			title = fmt.Sprintf("Highlights for %s", acctNode.Attr(importer.AcctAttrUserName))
   539		}
   540		return obj, obj.SetAttr(nodeattr.Title, title)
   541	}
   542	
   543	func (r *run) doAPI(result interface{}, apiUrl string, keyval ...string) error {
   544		return importer.OAuthContext{
   545			Ctx:    r.Context(),
   546			Client: r.oauthClient,
   547			Creds:  r.accessCreds}.PopulateJSONFromURL(result, http.MethodPost, apiUrl, keyval...)
   548	}
Website layout inspired by memcached.
Content by the authors.