1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16
17
18 package instapaper
19
20 import (
21 "encoding/json"
22 "errors"
23 "fmt"
24 "html/template"
25 "log"
26 "net/http"
27 "net/url"
28 "os"
29 "sort"
30 "strings"
31 "sync"
32 "time"
33
34 "github.com/garyburd/go-oauth/oauth"
35 "go4.org/ctxutil"
36 "go4.org/syncutil"
37
38 "perkeep.org/internal/httputil"
39 "perkeep.org/pkg/importer"
40 "perkeep.org/pkg/schema"
41 "perkeep.org/pkg/schema/nodeattr"
42 "perkeep.org/pkg/search"
43 )
44
45 func init() {
46 importer.Register("instapaper", &imp{})
47 }
48
// user is the account information decoded by getUserInfo from the
// verify_credentials response. A UserId of 0 is treated there as "no user".
type user struct {
	UserId   int    `json:"user_id"`
	Username string `json:"username"`
}
53
// folder identifies a set of bookmarks to list. Entries are either decoded
// from the folders list response or are the fixed "unread", "starred" and
// "archive" entries appended by getFolders, which is why FolderId is kept as
// a json.Number (a string type) rather than an int.
type folder struct {
	Title    string
	FolderId json.Number `json:"folder_id"`
}
58
// bookmark is one entry decoded from the bookmarks list response.
//
// Time and ProgressTimestamp are converted with time.Unix(v, 0) when the
// bookmark is imported, i.e. they are handled as Unix times in seconds.
// Entries whose BookmarkId is 0 are skipped by importBookmarks.
type bookmark struct {
	Hash              string  `json:"hash"`
	Description       string  `json:"description"`
	BookmarkId        int     `json:"bookmark_id"`
	PrivateSource     string  `json:"private_source"`
	Title             string  `json:"title"`
	Url               string  `json:"url"`
	ProgressTimestamp int     `json:"progress_timestamp"`
	Time              int     `json:"time"`
	Progress          float64 `json:"progress"`
	Starred           string  `json:"starred"`
}
71
// highlight is one entry decoded from a bookmark's highlights list response.
//
// Time is converted with time.Unix(v, 0) when the highlight is imported, i.e.
// it is handled as a Unix time in seconds. Text is stored as the node content
// and Note as its description.
type highlight struct {
	HighlightId int    `json:"highlight_id"`
	Text        string `json:"text"`
	Note        string `json:"note"`
	BookmarkId  int    `json:"bookmark_id"`
	Time        int    `json:"time"`
	Position    int    `json:"position"`
}
80
const (
	// Values written to nodeattr.Type on imported bookmark and highlight
	// permanodes.
	nodeTypeBookmark  = "instapaper.com:bookmark"
	nodeTypeHighlight = "instapaper.com:highlight"

	// attrBookmarkId holds the Instapaper bookmark ID. It is set on both
	// bookmark and highlight nodes and is the attribute that
	// findExistingBookmark searches on.
	attrBookmarkId = "instapaper.com:bookmarkId"
	// attrUrl holds the https://www.instapaper.com/read/<id> URL of the
	// bookmark; the bookmarked page's own URL is stored under nodeattr.URL.
	attrUrl = "instapaper.com:url"
	// attrProgress holds the bookmark's Progress value formatted with
	// fmt.Sprint.
	attrProgress = "instapaper.com:progress"
	// attrProgressTimestamp holds the bookmark's ProgressTimestamp formatted
	// as an RFC 3339 time.
	attrProgressTimestamp = "instapaper.com:progressTimestamp"

	// requestLimit is sent as the "limit" parameter of every bookmark list
	// request.
	requestLimit = "500"
	// bookmarksAtOnce is the size of the gate that bounds how many bookmarks
	// are imported concurrently.
	bookmarksAtOnce = 20
	// runCompleteVersion is stored on the account node after a successful
	// run. A later run is incremental only if the stored value equals it.
	runCompleteVersion = "1"

	// Instapaper API endpoints. highlightListRequestURL takes the bookmark ID
	// as its %d argument.
	tokenRequestURL         = "https://www.instapaper.com/api/1/oauth/access_token"
	verifyUserRequestURL    = "https://www.instapaper.com/api/1/account/verify_credentials"
	bookmarkListRequestURL  = "https://www.instapaper.com/api/1/bookmarks/list"
	bookmarkTextRequestURL  = "https://www.instapaper.com/api/1/bookmarks/get_text"
	foldersListRequestURL   = "https://www.instapaper.com/api/1.1/folders/list"
	highlightListRequestURL = "https://www.instapaper.com/api/1.1/bookmarks/%d/highlights"
)
106
var (
	// logger writes this importer's progress and error messages to stderr,
	// each prefixed with "instapaper.com: " and the standard date and time.
	logger = log.New(os.Stderr, "instapaper.com: ", log.LstdFlags)
)
110
// imp is the Instapaper importer implementation that init registers with the
// importer package. It embeds importer.OAuth1.
type imp struct {
	importer.OAuth1
}
114
115 func (*imp) Properties() importer.Properties {
116 return importer.Properties{
117 Title: "Instapaper",
118 Description: "Import full text bookmarks and highlights from an Instapaper account",
119 NeedsAPIKey: true,
120 SupportsIncremental: true,
121 }
122 }
123
124 func (*imp) IsAccountReady(acct *importer.Object) (ready bool, err error) {
125 return acct.Attr(importer.AcctAttrAccessToken) != "" && acct.Attr(importer.AcctAttrUserID) != "", nil
126 }
127
128 func (*imp) SummarizeAccount(acct *importer.Object) string {
129 userID := acct.Attr(importer.AcctAttrUserID)
130 if userID == "" {
131 return "Not configured"
132 }
133 return userID
134 }
135
136 func (imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
137 return tmpl.ExecuteTemplate(w, "serveSetup", ctx)
138 }
139
// tmpl holds the HTML templates of this importer. Its "serveSetup" template
// renders the account setup form, which submits the username and password,
// together with the account permanode ref as "acct", to the CallbackURL of
// the data it is executed with.
var tmpl = template.Must(template.New("root").Parse(`
{{define "serveSetup"}}
<h1>Configuring Instapaper Account</h1>
<h3>If your Instapaper account does not have a password, leave that field blank. However, a username is required. Passwords are not stored at all and are only used to retrieve an access token.</h3>
<form method="get" action="{{.CallbackURL}}">
<input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}">
<table border=0 cellpadding=3>
<tr><td align=right>Username</td><td><input name="username" size=50 required></td></tr>
<tr><td align=right>Password</td><td><input name="password" type="password" size=50></td></tr>
<tr><td align=right></td><td><input type="submit" value="Add"></td></tr>
</table>
</form>
{{end}}
`))
154
// Compile-time assertion that *imp implements importer.ImporterSetupHTMLer.
var _ importer.ImporterSetupHTMLer = (*imp)(nil)
156
157 func (im *imp) AccountSetupHTML(host *importer.Host) string {
158 return "<h1>Configuring Instapaper</h1><p>To get an OAuth client ID and secret, <a target=\"_blank\" href=\"https://www.instapaper.com/main/request_oauth_consumer_token\">fill this out</a>. You should receive an email response from Instapaper with the Client ID and Client Secret that you should use in the form above.</p>"
159 }
160
161 func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
162 username := r.FormValue("username")
163 password := r.FormValue("password")
164
165
166 if username == "" {
167 httputil.BadRequestError(w, "Expected a username")
168 return
169 }
170
171 clientID, secret, err := ctx.Credentials()
172 if err != nil {
173 httputil.ServeError(w, r, fmt.Errorf("Credentials error: %v", err))
174 return
175 }
176
177 oauthClient := &oauth.Client{
178 TokenRequestURI: tokenRequestURL,
179 Credentials: oauth.Credentials{
180 Token: clientID,
181 Secret: secret,
182 },
183 }
184 creds, _, err := oauthClient.RequestTokenXAuth(ctxutil.Client(ctx), nil, username, password)
185 if err != nil {
186 httputil.ServeError(w, r, fmt.Errorf("Failed to get access token: %v", err))
187 return
188 }
189
190 user, err := getUserInfo(importer.OAuthContext{Ctx: ctx.Context, Client: oauthClient, Creds: creds})
191 if err != nil {
192 httputil.ServeError(w, r, fmt.Errorf("Failed to verify credentials: %v", err))
193 return
194 }
195
196 if err := ctx.AccountNode.SetAttrs(
197 nodeattr.Title, fmt.Sprintf("Instapaper account: %s", user.Username),
198 importer.AcctAttrAccessToken, creds.Token,
199 importer.AcctAttrAccessTokenSecret, creds.Secret,
200 importer.AcctAttrUserName, user.Username,
201 importer.AcctAttrUserID, fmt.Sprint(user.UserId),
202 ); err != nil {
203 httputil.ServeError(w, r, fmt.Errorf("Error setting attributes: %v", err))
204 return
205 }
206 http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
207 }
208
209 func (im *imp) Run(ctx *importer.RunContext) (err error) {
210 clientId, secret, err := ctx.Credentials()
211 if err != nil {
212 return fmt.Errorf("no API credentials: %v", err)
213 }
214 acctNode := ctx.AccountNode()
215 accessToken := acctNode.Attr(importer.AcctAttrAccessToken)
216 accessSecret := acctNode.Attr(importer.AcctAttrAccessTokenSecret)
217 if accessToken == "" || accessSecret == "" {
218 return errors.New("access credentials not found")
219 }
220 userID := acctNode.Attr(importer.AcctAttrUserID)
221 if userID == "" {
222 return errors.New("userID hasn't been set by account setup")
223 }
224 r := &run{
225 RunContext: ctx,
226 im: im,
227 incremental: acctNode.Attr(importer.AcctAttrCompletedVersion) == runCompleteVersion,
228 oauthClient: &oauth.Client{
229 Credentials: oauth.Credentials{
230 Token: clientId,
231 Secret: secret,
232 },
233 },
234 accessCreds: &oauth.Credentials{
235 Token: accessToken,
236 Secret: accessSecret,
237 },
238 }
239 folders, err := r.getFolders()
240 if err != nil {
241 return err
242 }
243 if err := r.importBookmarks(userID, folders); err != nil {
244 return err
245 }
246
247 return acctNode.SetAttrs(importer.AcctAttrCompletedVersion, runCompleteVersion)
248 }
249
// run holds the state of a single import run started by (*imp).Run.
type run struct {
	*importer.RunContext
	im *imp
	// incremental is true when the account node already records a completed
	// run with the current runCompleteVersion.
	incremental bool

	// oauthClient carries the API (consumer) credentials and accessCreds the
	// account's access token and secret; together they sign every API call.
	oauthClient *oauth.Client
	accessCreds *oauth.Credentials

	// mu guards txtReqs, which is appended to from concurrent bookmark
	// imports and lists the bookmarks whose text is fetched afterwards.
	mu      sync.Mutex
	txtReqs []txtReq
}
261
262 func getUserInfo(ctx importer.OAuthContext) (*user, error) {
263 var ui []user
264 if err := ctx.PopulateJSONFromURL(&ui, http.MethodPost, verifyUserRequestURL); err != nil {
265 return nil, err
266 }
267 if ui[0].UserId == 0 {
268 return nil, errors.New("no user returned")
269 }
270 return &ui[0], nil
271 }
272
// parseFilename builds the file name used for a bookmark: the title t with
// every "/" replaced by "-", followed by "_", the id, and ".html".
func parseFilename(t string, id string) string {
	safeTitle := strings.ReplaceAll(t, "/", "-")
	return fmt.Sprintf("%s_%s.html", safeTitle, id)
}
276
277 func (r *run) findExistingBookmark(bookmarkId string) (*importer.Object, error) {
278 res, err := r.Host.Searcher().Query(r.Context(), &search.SearchQuery{
279 Constraint: &search.Constraint{
280 Permanode: &search.PermanodeConstraint{
281 Attr: attrBookmarkId,
282 Value: bookmarkId,
283 },
284 },
285 Describe: &search.DescribeRequest{
286 Depth: 1,
287 },
288 })
289 if err != nil {
290 return nil, err
291 }
292 if res.Describe == nil {
293 return nil, os.ErrNotExist
294 }
295 for _, resBlob := range res.Blobs {
296 br := resBlob.Blob
297 desBlob, ok := res.Describe.Meta[br.String()]
298 if !ok || desBlob.Permanode == nil {
299 continue
300 }
301 return r.Host.ObjectFromRef(br)
302 }
303 return nil, os.ErrNotExist
304 }
305
306 func (r *run) getFolders() ([]folder, error) {
307 var folders []folder
308 if err := r.doAPI(&folders, foldersListRequestURL); err != nil {
309 return nil, err
310 }
311 return append(folders,
312 folder{Title: "Unread", FolderId: "unread"},
313 folder{Title: "Starred", FolderId: "starred"},
314 folder{Title: "Archive", FolderId: "archive"},
315 ), nil
316 }
317
// txtReq is a pending request to fetch a bookmark's text: bm is the bookmark
// and bmNode the permanode that receives the stored text as camliContent.
type txtReq struct {
	bmNode *importer.Object
	bm     *bookmark
}
322
323 func (r *run) importBookmarks(userID string, folders []folder) error {
324 bsParent, err := r.getTopLevelNode("bookmarks")
325 if err != nil {
326 return err
327 }
328 hsParent, err := r.getTopLevelNode("highlights")
329 if err != nil {
330 return err
331 }
332
333 var (
334 gate = syncutil.NewGate(bookmarksAtOnce)
335 grp syncutil.Group
336 )
337
338 for fi := range folders {
339 f := folders[fi]
340 var bList []*bookmark
341
342 err := r.doAPI(&bList, bookmarkListRequestURL, "limit", requestLimit, "folder_id", f.FolderId.String())
343 if err != nil {
344 return err
345 }
346
347 for bi := range bList {
348 select {
349 case <-r.Context().Done():
350 logger.Printf("importer interrupted")
351 return r.Context().Err()
352 default:
353 }
354
355 b := bList[bi]
356 if b.BookmarkId == 0 {
357 continue
358 }
359
360 gate.Start()
361 grp.Go(func() error {
362 defer gate.Done()
363 bNode, dup, err := r.importBookmark(bsParent, b, f.Title)
364 if err != nil {
365 logger.Printf("error importing bookmark %d %v", b.BookmarkId, err)
366 return err
367 }
368 if !r.incremental || !dup {
369 r.mu.Lock()
370 r.txtReqs = append(r.txtReqs, txtReq{bmNode: bNode, bm: b})
371 r.mu.Unlock()
372 }
373 return r.importHighlights(hsParent, bNode, b)
374 })
375 }
376 }
377
378 err = grp.Err()
379 if err != nil {
380 return err
381 }
382
383
384
385
386 for _, req := range r.txtReqs {
387 if err := r.importBookmarkText(req); err != nil {
388 return err
389 }
390 }
391 return nil
392 }
393
394 func (r *run) importBookmark(parent *importer.Object, b *bookmark, folder string) (*importer.Object, bool, error) {
395
396
397
398
399 bmNode, err := parent.ChildPathObjectOrFunc(parseFilename(b.Title, fmt.Sprint(b.BookmarkId)),
400 func() (*importer.Object, error) {
401 found, err := r.findExistingBookmark(fmt.Sprint(b.BookmarkId))
402 if err != nil {
403 if err != os.ErrNotExist {
404 return nil, fmt.Errorf("searching for node with %v %v: %v", attrBookmarkId, b.BookmarkId, err)
405 }
406 return r.Host.NewObject()
407 }
408
409
410
411
412 oldTitle := parseFilename(found.Attr(nodeattr.Title), fmt.Sprint(b.BookmarkId))
413 if err := parent.DelAttr(fmt.Sprintf("camliPath:%s", oldTitle), ""); err != nil {
414 return nil, err
415 }
416 return found, nil
417 })
418 if err != nil {
419 return nil, false, err
420 }
421
422 instapaperUrl := fmt.Sprintf("https://www.instapaper.com/read/%v", b.BookmarkId)
423 attrs := []string{
424 attrBookmarkId, fmt.Sprint(b.BookmarkId),
425 nodeattr.Type, nodeTypeBookmark,
426 nodeattr.DateCreated, schema.RFC3339FromTime(time.Unix(int64(b.Time), 0)),
427 nodeattr.Title, b.Title,
428 nodeattr.Description, b.Description,
429 nodeattr.URL, b.Url,
430 attrUrl, instapaperUrl,
431 attrProgress, fmt.Sprint(b.Progress),
432 attrProgressTimestamp, schema.RFC3339FromTime(time.Unix(int64(b.ProgressTimestamp), 0)),
433 nodeattr.Starred, b.Starred,
434 nodeattr.Folder, folder,
435 }
436
437 changes, err := bmNode.SetAttrs2(attrs...)
438 if err == nil && changes {
439 logger.Printf("imported bookmark %s", b.Url)
440 }
441 return bmNode, !changes, nil
442 }
443
444 func (r *run) importBookmarkText(req txtReq) error {
445 filename := parseFilename(req.bm.Title, fmt.Sprint(req.bm.BookmarkId))
446 form := url.Values{}
447 form.Add("bookmark_id", fmt.Sprint(req.bm.BookmarkId))
448 resp, err := importer.OAuthContext{
449 Ctx: r.Context(),
450 Client: r.oauthClient,
451 Creds: r.accessCreds}.POST(bookmarkTextRequestURL, form)
452 if err != nil {
453 if resp != nil && resp.StatusCode == http.StatusBadRequest {
454
455
456 logger.Printf("no text available for %v: %v", req.bm.Url, err)
457 return nil
458 }
459 return err
460 }
461 defer resp.Body.Close()
462 fileRef, err := schema.WriteFileFromReader(r.Context(), r.Host.Target(), filename, resp.Body)
463 if err != nil {
464 return fmt.Errorf("error storing bookmark content: %v", err)
465 }
466 err = req.bmNode.SetAttr("camliContent", fileRef.String())
467 if err == nil {
468 logger.Printf("imported text for %s", req.bm.Url)
469 }
470 return err
471 }
472
473 func (r *run) importHighlights(parent *importer.Object, bNode *importer.Object, b *bookmark) error {
474 var hList []*highlight
475 err := r.doAPI(&hList, fmt.Sprintf(highlightListRequestURL, b.BookmarkId))
476 if err != nil {
477 return err
478 }
479
480
481
482 sort.Slice(hList, func(i, j int) bool {
483 return hList[i].Time > hList[j].Time
484 })
485
486 for hi := range hList {
487 h := hList[hi]
488 dup, err := r.importHighlight(parent, bNode, h)
489 if err != nil {
490 logger.Printf("error importing highlight %d %v", h.HighlightId, err)
491 }
492 if dup && r.incremental {
493 logger.Printf("incremental highlights import found end batch")
494 break
495 }
496 }
497 return nil
498 }
499
500 func (r *run) importHighlight(parent *importer.Object, bNode *importer.Object, h *highlight) (bool, error) {
501 hNode, err := parent.ChildPathObject(fmt.Sprint(h.HighlightId))
502 if err != nil {
503 return false, err
504 }
505
506 attrs := []string{
507 nodeattr.Type, nodeTypeHighlight,
508 nodeattr.DateCreated, schema.RFC3339FromTime(time.Unix(int64(h.Time), 0)),
509 nodeattr.Title, bNode.Attr(nodeattr.Title),
510 nodeattr.Content, h.Text,
511 nodeattr.Description, h.Note,
512 attrBookmarkId, fmt.Sprint(h.BookmarkId),
513 }
514
515 changes, err := hNode.SetAttrs2(attrs...)
516 return !changes, err
517 }
518
519 func (r *run) getTopLevelNode(path string) (*importer.Object, error) {
520 acctNode := r.AccountNode()
521 root := r.RootNode()
522 username := acctNode.Attr(importer.AcctAttrUserName)
523 rootTitle := fmt.Sprintf("Instapaper Data for %s", username)
524 if err := root.SetAttrs(nodeattr.Title, rootTitle, "camliImportRoot", "instapaper-"+username); err != nil {
525 return nil, err
526 }
527
528 obj, err := root.ChildPathObject(path)
529 if err != nil {
530 return nil, err
531 }
532
533 var title string
534 switch path {
535 case "bookmarks":
536 title = fmt.Sprintf("Bookmarks for %s", acctNode.Attr(importer.AcctAttrUserName))
537 case "highlights":
538 title = fmt.Sprintf("Highlights for %s", acctNode.Attr(importer.AcctAttrUserName))
539 }
540 return obj, obj.SetAttr(nodeattr.Title, title)
541 }
542
543 func (r *run) doAPI(result interface{}, apiUrl string, keyval ...string) error {
544 return importer.OAuthContext{
545 Ctx: r.Context(),
546 Client: r.oauthClient,
547 Creds: r.accessCreds}.PopulateJSONFromURL(result, http.MethodPost, apiUrl, keyval...)
548 }