Home Download Docs Code Community
     1	/*
     2	Copyright 2014 The Perkeep Authors
     3	
     4	Licensed under the Apache License, Version 2.0 (the "License");
     5	you may not use this file except in compliance with the License.
     6	You may obtain a copy of the License at
     7	
     8	     http://www.apache.org/licenses/LICENSE-2.0
     9	
    10	Unless required by applicable law or agreed to in writing, software
    11	distributed under the License is distributed on an "AS IS" BASIS,
    12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13	See the License for the specific language governing permissions and
    14	limitations under the License.
    15	*/
    16	
    17	// Package twitter implements a twitter.com importer.
    18	package twitter // import "perkeep.org/pkg/importer/twitter"
    19	
    20	import (
    21		"archive/zip"
    22		"bufio"
    23		"bytes"
    24		"encoding/json"
    25		"errors"
    26		"fmt"
    27		"html"
    28		"io"
    29		"log"
    30		"net/http"
    31		"net/url"
    32		"os"
    33		"path"
    34		"regexp"
    35		"strconv"
    36		"strings"
    37		"sync"
    38		"time"
    39	
    40		"perkeep.org/internal/httputil"
    41		"perkeep.org/pkg/blob"
    42		"perkeep.org/pkg/importer"
    43		"perkeep.org/pkg/schema"
    44		"perkeep.org/pkg/schema/nodeattr"
    45	
    46		"github.com/garyburd/go-oauth/oauth"
    47	
    48		"go4.org/ctxutil"
    49		"go4.org/syncutil"
    50	)
    51	
const (
	// apiURL is the base that the *APIPath constants below are appended to
	// when building request URLs (see doAPI and getUserInfo). The three
	// oauth/ URLs are the OAuth endpoints handed to the OAuth client, both
	// through oAuthURIs and directly in Run and LongPoll.
	apiURL                        = "https://api.twitter.com/1.1/"
	temporaryCredentialRequestURL = "https://api.twitter.com/oauth/request_token"
	resourceOwnerAuthorizationURL = "https://api.twitter.com/oauth/authorize"
	tokenRequestURL               = "https://api.twitter.com/oauth/access_token"
	userInfoAPIPath               = "account/verify_credentials.json"
	userTimeLineAPIPath           = "statuses/user_timeline.json"
	userLikesAPIPath              = "favorites/list.json"

	// runCompleteVersion is a cache-busting version number of the
	// importer code. It should be incremented whenever the
	// behavior of this importer is updated enough to warrant a
	// complete run. Otherwise, if the importer runs to
	// completion, this version number is recorded on the account
	// permanode and subsequent import runs can stop early.
	runCompleteVersion = "5"

	// acctAttrTweetZip specifies an optional attribute for the account permanode.
	// If set, it should be of a "file" schema blob referencing the tweets.zip
	// file that Twitter makes available for the full archive download.
	// The Twitter API doesn't go back forever in time, so if you started using
	// the Perkeep importer too late, you need to "pk-put file tweets.zip"
	// once downloading it from Twitter, and then:
	//   $ pk-put attr <acct-permanode> twitterArchiveZipFileRef <zip-fileref>
	// ... and re-do an import.
	acctAttrTweetZip = "twitterArchiveZipFileRef"

	// acctAttrImportLikes specifies an optional attribute for the account permanode.
	// If set to true, likes are imported via the Twitter API.
	// You can enable importing likes like this:
	//   $ pk-put attr <acct-permanode> twitterImportLikes true
	// ... and re-do an import.
	acctAttrImportLikes = "twitterImportLikes"

	// acctAttrZipDoneVersion is updated at the end of a successful zip import and
	// is used to determine whether the zip file needs to be re-imported in a future run.
	acctAttrZipDoneVersion = "twitterZipDoneVersion" // == "<fileref>:<runCompleteVersion>"

	// attrImportMethod is a per-tweet note of how we imported it: either
	// "zip" or "api".
	attrImportMethod = "twitterImportMethod"

	tweetRequestLimit = 200 // max number of tweets we can get in a user_timeline request
	tweetsAtOnce      = 20  // how many tweets to import at once

	// A tweet is stored as a permanode with the "twitter.com:tweet" camliNodeType value.
	nodeTypeTweet = "twitter.com:tweet"

	// A like is stored as a permanode with the "twitter.com:like" camliNodeType value.
	nodeTypeLike = "twitter.com:like"
)
   102	
// oAuthURIs bundles the three Twitter OAuth endpoints in the form expected
// by SetupContext.NewOAuthClient, which ServeSetup and ServeCallback use.
var oAuthURIs = importer.OAuthURIs{
	TemporaryCredentialRequestURI: temporaryCredentialRequestURL,
	ResourceOwnerAuthorizationURI: resourceOwnerAuthorizationURL,
	TokenRequestURI:               tokenRequestURL,
}
   108	
// init registers this importer with the importer package under the name
// "twitter".
func init() {
	importer.Register("twitter", &imp{})
}
   112	
// Compile-time check that *imp implements importer.ImporterSetupHTMLer
// (see AccountSetupHTML).
var _ importer.ImporterSetupHTMLer = (*imp)(nil)

// imp is the Twitter importer implementation registered in init.
type imp struct {
	importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters
}
   118	
   119	func (*imp) Properties() importer.Properties {
   120		return importer.Properties{
   121			Title:       "Twitter",
   122			Description: "import tweets and media from tweets",
   123			// TODO: doc URL for linking to info on historical tweets from ZIP files beyond API limit
   124			SupportsIncremental: true,
   125			NeedsAPIKey:         true,
   126		}
   127	}
   128	
   129	func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) {
   130		if acctNode.Attr(importer.AcctAttrUserID) != "" && acctNode.Attr(importer.AcctAttrAccessToken) != "" {
   131			return true, nil
   132		}
   133		return false, nil
   134	}
   135	
   136	func (im *imp) SummarizeAccount(acct *importer.Object) string {
   137		ok, err := im.IsAccountReady(acct)
   138		if err != nil {
   139			return "Not configured; error = " + err.Error()
   140		}
   141		if !ok {
   142			return "Not configured"
   143		}
   144		s := fmt.Sprintf("@%s (%s), twitter id %s",
   145			acct.Attr(importer.AcctAttrUserName),
   146			acct.Attr(importer.AcctAttrName),
   147			acct.Attr(importer.AcctAttrUserID),
   148		)
   149		if acct.Attr(acctAttrTweetZip) != "" {
   150			s += " + zip file"
   151		}
   152		return s
   153	}
   154	
   155	func (im *imp) AccountSetupHTML(host *importer.Host) string {
   156		base := host.ImporterBaseURL() + "twitter"
   157		return fmt.Sprintf(`
   158	<h1>Configuring Twitter</h1>
   159	<p>Visit <a href='https://apps.twitter.com/'>https://apps.twitter.com/</a> and click "Create New App".</p>
   160	<p>Use the following settings:</p>
   161	<ul>
   162	  <li>Name: Does not matter. (camlistore-importer).</li>
   163	  <li>Description: Does not matter. (imports twitter data into camlistore).</li>
   164	  <li>Website: <b>%s</b></li>
   165	  <li>Callback URL: <b>%s</b></li>
   166	</ul>
   167	<!-- TODO(mpl): use CSS to style it to 80 chars wide instead of doing it in source -->
   168	<p>
   169	Click "Create your Twitter application".You should be redirected to the</br>
   170	Application Management page of your newly created application.</br>
   171	Go to the "Keys and Access Tokens" tab. Copy the "Consumer Key (API Key)" and</br>
   172	"Consumer Secret (API Secret)" into the "Client ID" and "Client Secret" boxes</br>
   173	above.
   174	</p>
   175	<p>
   176	Note that the twitter API prevents us from getting more than 3200 tweets<br>
   177	(including retweets) through your user timeline. So if you have more than that<br>
   178	limit (and want to get them all), after you have configured this account, you<br>
   179	need to download all your data as a zip first. Which you can do on your twitter<br>
   180	page, at: "Settings and Privacy", "Your Twitter data", "Download your Twitter<br>
   181	data". Then upload it to your instance with "pk-put file tweets.zip" (this will<br>
   182	return the zip-fileref), and signal the twitter importer that you have it, with<br>
   183	"pk-put attr &lt;acct-permanode&gt; twitterArchiveZipFileRef &lt;zip-fileref&gt;".<br>
   184	Then you can start running the importer.
   185	</p>
   186	<p>
   187	If you want to import likes as well, please run <br>
   188	"pk-put attr &lt;acct-permanode&gt; twitterImportLikes true" to enable it.
   189	</p>
   190	`, base, base+"/callback")
   191	}
   192	
// A run is our state for a given run of the importer.
// It is created by (*imp).Run and shared by the goroutines that import
// individual tweets.
type run struct {
	*importer.RunContext
	im          *imp
	incremental bool // whether we've completed a run in the past (at the current runCompleteVersion, and a full import is not forced)

	oauthClient *oauth.Client      // No need to guard, used read-only.
	accessCreds *oauth.Credentials // No need to guard, used read-only.

	mu     sync.Mutex // guards anyErr
	anyErr bool       // set by errorf; when true, Run does not record the run as complete
}
   205	
// forceFullImport is read once from the CAMLI_TWITTER_FULL_IMPORT environment
// variable. When true, Run never treats an import as incremental. The parse
// error is deliberately ignored: an unset or unparsable value yields false.
var forceFullImport, _ = strconv.ParseBool(os.Getenv("CAMLI_TWITTER_FULL_IMPORT"))
   207	
// Run performs one import for the account in ctx. In order, it:
//   - imports the user's timeline through the API, unless the
//     CAMLI_TWITTER_SKIP_API_IMPORT environment variable parses as true;
//   - imports likes through the API if the account's twitterImportLikes
//     attribute parses as true;
//   - imports tweets from the archive zip referenced by the account's
//     twitterArchiveZipFileRef attribute, unless this is an incremental run
//     and that same zip was already imported at the current
//     runCompleteVersion;
//   - records runCompleteVersion on the account if no error was noted via
//     errorf during the run.
//
// It returns an error if the API credentials, access credentials or user ID
// are missing, or as soon as one of the steps above fails.
func (im *imp) Run(ctx *importer.RunContext) error {
	clientId, secret, err := ctx.Credentials()
	if err != nil {
		return fmt.Errorf("no API credentials: %v", err)
	}
	acctNode := ctx.AccountNode()
	accessToken := acctNode.Attr(importer.AcctAttrAccessToken)
	accessSecret := acctNode.Attr(importer.AcctAttrAccessTokenSecret)
	if accessToken == "" || accessSecret == "" {
		return errors.New("access credentials not found")
	}
	r := &run{
		RunContext: ctx,
		im:         im,
		// Incremental only if a previous run completed with this very
		// version of the importer and a full import isn't being forced.
		incremental: !forceFullImport && acctNode.Attr(importer.AcctAttrCompletedVersion) == runCompleteVersion,

		oauthClient: &oauth.Client{
			TemporaryCredentialRequestURI: temporaryCredentialRequestURL,
			ResourceOwnerAuthorizationURI: resourceOwnerAuthorizationURL,
			TokenRequestURI:               tokenRequestURL,
			Credentials: oauth.Credentials{
				Token:  clientId,
				Secret: secret,
			},
		},
		accessCreds: &oauth.Credentials{
			Token:  accessToken,
			Secret: accessSecret,
		},
	}

	userID := acctNode.Attr(importer.AcctAttrUserID)
	if userID == "" {
		return errors.New("userID hasn't been set by account setup")
	}

	// An unset or unparsable variable leaves skipAPITweets false, so the
	// API import runs by default.
	skipAPITweets, _ := strconv.ParseBool(os.Getenv("CAMLI_TWITTER_SKIP_API_IMPORT"))
	if !skipAPITweets {
		if err := r.importTweets(userID, userTimeLineAPIPath); err != nil {
			return err
		}
	}

	// Re-fetch the account node by its permanode ref before reading the
	// optional attributes below.
	acctNode, err = ctx.Host.ObjectFromRef(acctNode.PermanodeRef())
	if err != nil {
		return fmt.Errorf("error reloading account node: %v", err)
	}
	// A missing or unparsable attribute value simply means likes are not
	// imported; the parse error is not reported.
	importLikes, err := strconv.ParseBool(acctNode.Attr(acctAttrImportLikes))
	if err == nil && importLikes {
		if err := r.importTweets(userID, userLikesAPIPath); err != nil {
			return err
		}
	}

	zipRef := acctNode.Attr(acctAttrTweetZip)
	zipDoneVal := zipRef + ":" + runCompleteVersion
	// Import the zip if one is configured, except on incremental runs where
	// this exact zip was already imported by this importer version.
	if zipRef != "" && !(r.incremental && acctNode.Attr(acctAttrZipDoneVersion) == zipDoneVal) {
		zipbr, ok := blob.Parse(zipRef)
		if !ok {
			return fmt.Errorf("invalid zip file blobref %q", zipRef)
		}
		fr, err := schema.NewFileReader(r.Context(), r.Host.BlobSource(), zipbr)
		if err != nil {
			return fmt.Errorf("error opening zip %v: %v", zipbr, err)
		}
		defer fr.Close()
		zr, err := zip.NewReader(fr, fr.Size())
		if err != nil {
			return fmt.Errorf("Error opening twitter zip file %v: %v", zipRef, err)
		}
		if err := r.importTweetsFromZip(userID, zr); err != nil {
			return err
		}
		if err := acctNode.SetAttrs(acctAttrZipDoneVersion, zipDoneVal); err != nil {
			return err
		}
	}

	r.mu.Lock()
	anyErr := r.anyErr
	r.mu.Unlock()

	// Only mark the run as complete when nothing was reported through
	// errorf, so that the next run is not treated as incremental.
	if !anyErr {
		if err := acctNode.SetAttrs(importer.AcctAttrCompletedVersion, runCompleteVersion); err != nil {
			return err
		}
	}

	return nil
}
   298	
   299	var _ importer.LongPoller = (*imp)(nil)
   300	
   301	func (im *imp) LongPoll(rctx *importer.RunContext) error {
   302		clientId, secret, err := rctx.Credentials()
   303		if err != nil {
   304			return err
   305		}
   306	
   307		acctNode := rctx.AccountNode()
   308		accessToken := acctNode.Attr(importer.AcctAttrAccessToken)
   309		accessSecret := acctNode.Attr(importer.AcctAttrAccessTokenSecret)
   310		if accessToken == "" || accessSecret == "" {
   311			return errors.New("access credentials not found")
   312		}
   313		oauthClient := &oauth.Client{
   314			TemporaryCredentialRequestURI: temporaryCredentialRequestURL,
   315			ResourceOwnerAuthorizationURI: resourceOwnerAuthorizationURL,
   316			TokenRequestURI:               tokenRequestURL,
   317			Credentials: oauth.Credentials{
   318				Token:  clientId,
   319				Secret: secret,
   320			},
   321		}
   322		accessCreds := &oauth.Credentials{
   323			Token:  accessToken,
   324			Secret: accessSecret,
   325		}
   326	
   327		form := url.Values{"with": {"user"}}
   328		req, _ := http.NewRequest("GET", "https://userstream.twitter.com/1.1/user.json", nil)
   329		req.Header.Set("Authorization", oauthClient.AuthorizationHeader(accessCreds, "GET", req.URL, form))
   330		req.URL.RawQuery = form.Encode()
   331		req.Cancel = rctx.Context().Done()
   332	
   333		log.Printf("twitter: beginning long poll, awaiting new tweets...")
   334		res, err := http.DefaultClient.Do(req)
   335		if err != nil {
   336			return err
   337		}
   338		defer res.Body.Close()
   339		if res.StatusCode != 200 {
   340			return errors.New(res.Status)
   341		}
   342		bs := bufio.NewScanner(res.Body)
   343		for bs.Scan() {
   344			line := strings.TrimSpace(bs.Text())
   345			if line == "" || strings.HasPrefix(line, `{"friends`) {
   346				continue
   347			}
   348			log.Printf("twitter: long poll saw activity")
   349			return nil
   350		}
   351		if err := bs.Err(); err != nil {
   352			return err
   353		}
   354		return errors.New("twitter: got EOF without a tweet")
   355	}
   356	
   357	func (r *run) errorf(format string, args ...interface{}) {
   358		log.Printf("twitter: "+format, args...)
   359		r.mu.Lock()
   360		defer r.mu.Unlock()
   361		r.anyErr = true
   362	}
   363	
   364	func (r *run) doAPI(result interface{}, apiPath string, keyval ...string) error {
   365		return importer.OAuthContext{
   366			Ctx:    r.Context(),
   367			Client: r.oauthClient,
   368			Creds:  r.accessCreds,
   369		}.PopulateJSONFromURL(result, http.MethodGet, apiURL+apiPath, keyval...)
   370	}
   371	
   372	// importTweets imports the tweets related to userID, through apiPath.
   373	// If apiPath is userTimeLineAPIPath, the tweets and retweets posted by userID are imported.
   374	// If apiPath is userLikesAPIPath, the tweets liked by userID are imported.
   375	func (r *run) importTweets(userID string, apiPath string) error {
   376		maxId := ""
   377		continueRequests := true
   378	
   379		var tweetsNode *importer.Object
   380		var err error
   381		var importType string
   382		if apiPath == userLikesAPIPath {
   383			importType = "likes"
   384		} else {
   385			importType = "tweets"
   386		}
   387		tweetsNode, err = r.getTopLevelNode(importType)
   388		if err != nil {
   389			return err
   390		}
   391	
   392		numTweets := 0
   393		sawTweet := map[string]bool{}
   394	
   395		// If attrs is changed, so should the expected responses accordingly for the
   396		// RoundTripper of MakeTestData (testdata.go).
   397		attrs := []string{
   398			"user_id", userID,
   399			"count", strconv.Itoa(tweetRequestLimit),
   400		}
   401		for continueRequests {
   402			select {
   403			case <-r.Context().Done():
   404				r.errorf("interrupted")
   405				return r.Context().Err()
   406			default:
   407			}
   408	
   409			var resp []*apiTweetItem
   410			var err error
   411			if maxId == "" {
   412				log.Printf("twitter: fetching %s for userid %s", importType, userID)
   413				err = r.doAPI(&resp, apiPath, attrs...)
   414			} else {
   415				log.Printf("twitter: fetching %s for userid %s with max ID %s", userID, importType, maxId)
   416				err = r.doAPI(&resp, apiPath,
   417					append(attrs, "max_id", maxId)...)
   418			}
   419			if err != nil {
   420				return err
   421			}
   422	
   423			var (
   424				newThisBatch = 0
   425				allDupMu     sync.Mutex
   426				allDups      = true
   427				gate         = syncutil.NewGate(tweetsAtOnce)
   428				grp          syncutil.Group
   429			)
   430			for i := range resp {
   431				tweet := resp[i]
   432	
   433				// Dup-suppression.
   434				if sawTweet[tweet.Id] {
   435					continue
   436				}
   437				sawTweet[tweet.Id] = true
   438				newThisBatch++
   439				maxId = tweet.Id
   440	
   441				gate.Start()
   442				grp.Go(func() error {
   443					defer gate.Done()
   444					dup, err := r.importTweet(tweetsNode, tweet, true)
   445					if !dup {
   446						allDupMu.Lock()
   447						allDups = false
   448						allDupMu.Unlock()
   449					}
   450					if err != nil {
   451						r.errorf("error importing tweet %s %v", tweet.Id, err)
   452					}
   453					return err
   454				})
   455			}
   456			if err := grp.Err(); err != nil {
   457				return err
   458			}
   459			numTweets += newThisBatch
   460			log.Printf("twitter: imported %d %s this batch; %d total.", newThisBatch, importType, numTweets)
   461			if r.incremental && allDups {
   462				log.Printf("twitter: incremental import found end batch")
   463				break
   464			}
   465			continueRequests = newThisBatch > 0
   466		}
   467		log.Printf("twitter: successfully did full run of importing %d %s", numTweets, importType)
   468		return nil
   469	}
   470	
   471	func tweetsFromZipFile(zf *zip.File) (tweets []*zipTweetItem, err error) {
   472		rc, err := zf.Open()
   473		if err != nil {
   474			return nil, err
   475		}
   476		slurp, err := io.ReadAll(rc)
   477		rc.Close()
   478		if err != nil {
   479			return nil, err
   480		}
   481		i := bytes.IndexByte(slurp, '[')
   482		if i < 0 {
   483			return nil, errors.New("No '[' found in zip file")
   484		}
   485		slurp = slurp[i:]
   486		if err := json.Unmarshal(slurp, &tweets); err != nil {
   487			return nil, fmt.Errorf("JSON error: %v", err)
   488		}
   489		return
   490	}
   491	
   492	func (r *run) importTweetsFromZip(userID string, zr *zip.Reader) error {
   493		log.Printf("twitter: processing zip file with %d files", len(zr.File))
   494	
   495		tweetsNode, err := r.getTopLevelNode("tweets")
   496		if err != nil {
   497			return err
   498		}
   499	
   500		var (
   501			gate = syncutil.NewGate(tweetsAtOnce)
   502			grp  syncutil.Group
   503		)
   504		total := 0
   505		for _, zf := range zr.File {
   506			if !(strings.HasPrefix(zf.Name, "data/js/tweets/2") && strings.HasSuffix(zf.Name, ".js")) {
   507				continue
   508			}
   509			tweets, err := tweetsFromZipFile(zf)
   510			if err != nil {
   511				return fmt.Errorf("error reading tweets from %s: %v", zf.Name, err)
   512			}
   513	
   514			for i := range tweets {
   515				total++
   516				tweet := tweets[i]
   517				gate.Start()
   518				grp.Go(func() error {
   519					defer gate.Done()
   520					_, err := r.importTweet(tweetsNode, tweet, false)
   521					return err
   522				})
   523			}
   524		}
   525		err = grp.Err()
   526		log.Printf("zip import of tweets: %d total, err = %v", total, err)
   527		return err
   528	}
   529	
   530	func timeParseFirstFormat(timeStr string, format ...string) (t time.Time, err error) {
   531		if len(format) == 0 {
   532			panic("need more than 1 format")
   533		}
   534		for _, f := range format {
   535			t, err = time.Parse(f, timeStr)
   536			if err == nil {
   537				break
   538			}
   539		}
   540		return
   541	}
   542	
   543	// viaAPI is true if it came via the REST API, or false if it came via a zip file.
   544	func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) {
   545		select {
   546		case <-r.Context().Done():
   547			r.errorf("Twitter importer: interrupted")
   548			return false, r.Context().Err()
   549		default:
   550		}
   551		id := tweet.ID()
   552		tweetNode, err := parent.ChildPathObject(id)
   553		if err != nil {
   554			return false, err
   555		}
   556	
   557		// Because the zip format and the API format differ a bit, and
   558		// might diverge more in the future, never use the zip content
   559		// to overwrite data fetched via the API. If we add new
   560		// support for different fields in the future, we might want
   561		// to revisit this decision.  Be wary of flip/flopping data if
   562		// modifying this, though.
   563		if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI {
   564			return true, nil
   565		}
   566	
   567		// e.g. "2014-06-12 19:11:51 +0000"
   568		createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700")
   569		if err != nil {
   570			return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err)
   571		}
   572	
   573		url := fmt.Sprintf("https://twitter.com/%s/status/%v",
   574			r.AccountNode().Attr(importer.AcctAttrUserName),
   575			id)
   576	
   577		nodeType := nodeTypeTweet
   578		if tweet.Liked() {
   579			nodeType = nodeTypeLike
   580		}
   581	
   582		attrs := []string{
   583			"twitterId", id,
   584			nodeattr.Type, nodeType,
   585			nodeattr.StartDate, schema.RFC3339FromTime(createdTime),
   586			nodeattr.Content, tweet.Text(),
   587			nodeattr.URL, url,
   588		}
   589		if lat, long, ok := tweet.LatLong(); ok {
   590			attrs = append(attrs,
   591				nodeattr.Latitude, fmt.Sprint(lat),
   592				nodeattr.Longitude, fmt.Sprint(long),
   593			)
   594		}
   595		if viaAPI {
   596			attrs = append(attrs, attrImportMethod, "api")
   597		} else {
   598			attrs = append(attrs, attrImportMethod, "zip")
   599		}
   600	
   601		for i, m := range tweet.Media() {
   602			filename := m.BaseFilename()
   603			if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") {
   604				// Don't re-import media we've already fetched.
   605				continue
   606			}
   607			tried, gotMedia := 0, false
   608			for _, mediaURL := range m.URLs() {
   609				tried++
   610				res, err := ctxutil.Client(r.Context()).Get(mediaURL)
   611				if err != nil {
   612					return false, fmt.Errorf("Error fetching %s for tweet %s : %v", mediaURL, url, err)
   613				}
   614				if res.StatusCode == http.StatusNotFound {
   615					continue
   616				}
   617				if res.StatusCode != 200 {
   618					return false, fmt.Errorf("HTTP status %d fetching %s for tweet %s", res.StatusCode, mediaURL, url)
   619				}
   620				if !viaAPI {
   621					log.Printf("twitter: for zip tweet %s, reading %v", url, mediaURL)
   622				}
   623				fileRef, err := schema.WriteFileFromReader(r.Context(), r.Host.Target(), filename, res.Body)
   624				res.Body.Close()
   625				if err != nil {
   626					return false, fmt.Errorf("Error fetching media %s for tweet %s: %v", mediaURL, url, err)
   627				}
   628				attrs = append(attrs, "camliPath:"+filename, fileRef.String())
   629				if i == 0 {
   630					attrs = append(attrs, "camliContentImage", fileRef.String())
   631				}
   632				log.Printf("twitter: slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef())
   633				gotMedia = true
   634				break
   635			}
   636			if !gotMedia && tried > 0 {
   637				return false, fmt.Errorf("All media URLs 404s for tweet %s", url)
   638			}
   639		}
   640	
   641		changes, err := tweetNode.SetAttrs2(attrs...)
   642		if err == nil && changes {
   643			log.Printf("twitter: imported tweet %s", url)
   644		}
   645		return !changes, err
   646	}
   647	
   648	// path may be of: "tweets". (TODO: "lists", "direct_messages", etc.)
   649	func (r *run) getTopLevelNode(path string) (*importer.Object, error) {
   650		acctNode := r.AccountNode()
   651	
   652		root := r.RootNode()
   653		rootTitle := fmt.Sprintf("%s's Twitter Data", acctNode.Attr(importer.AcctAttrUserName))
   654		if err := root.SetAttr(nodeattr.Title, rootTitle); err != nil {
   655			return nil, err
   656		}
   657	
   658		obj, err := root.ChildPathObject(path)
   659		if err != nil {
   660			return nil, err
   661		}
   662		var title string
   663		switch path {
   664		case "tweets":
   665			title = fmt.Sprintf("%s's Tweets", acctNode.Attr(importer.AcctAttrUserName))
   666		case "likes":
   667			title = fmt.Sprintf("%s's Likes", acctNode.Attr(importer.AcctAttrUserName))
   668		}
   669		return obj, obj.SetAttr(nodeattr.Title, title)
   670	}
   671	
// userInfo holds the fields decoded from the JSON response of the
// userInfoAPIPath endpoint (see getUserInfo).
type userInfo struct {
	ID         string `json:"id_str"`
	ScreenName string `json:"screen_name"`
	Name       string `json:"name,omitempty"`
}
   677	
   678	func getUserInfo(ctx importer.OAuthContext) (userInfo, error) {
   679		var ui userInfo
   680		if err := ctx.PopulateJSONFromURL(&ui, http.MethodGet, apiURL+userInfoAPIPath); err != nil {
   681			return ui, err
   682		}
   683		if ui.ID == "" {
   684			return ui, fmt.Errorf("No userid returned")
   685		}
   686		return ui, nil
   687	}
   688	
   689	func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
   690		oauthClient, err := ctx.NewOAuthClient(oAuthURIs)
   691		if err != nil {
   692			err = fmt.Errorf("error getting OAuth client: %v", err)
   693			httputil.ServeError(w, r, err)
   694			return err
   695		}
   696		tempCred, err := oauthClient.RequestTemporaryCredentials(ctxutil.Client(ctx), ctx.CallbackURL(), nil)
   697		if err != nil {
   698			err = fmt.Errorf("Error getting temp cred: %v", err)
   699			httputil.ServeError(w, r, err)
   700			return err
   701		}
   702		if err := ctx.AccountNode.SetAttrs(
   703			importer.AcctAttrTempToken, tempCred.Token,
   704			importer.AcctAttrTempSecret, tempCred.Secret,
   705		); err != nil {
   706			err = fmt.Errorf("Error saving temp creds: %v", err)
   707			httputil.ServeError(w, r, err)
   708			return err
   709		}
   710	
   711		authURL := oauthClient.AuthorizationURL(tempCred, nil)
   712		http.Redirect(w, r, authURL, http.StatusFound)
   713		return nil
   714	}
   715	
   716	func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
   717		tempToken := ctx.AccountNode.Attr(importer.AcctAttrTempToken)
   718		tempSecret := ctx.AccountNode.Attr(importer.AcctAttrTempSecret)
   719		if tempToken == "" || tempSecret == "" {
   720			log.Printf("twitter: no temp creds in callback")
   721			httputil.BadRequestError(w, "no temp creds in callback")
   722			return
   723		}
   724		if tempToken != r.FormValue("oauth_token") {
   725			log.Printf("twitter: unexpected oauth_token: got %v, want %v", r.FormValue("oauth_token"), tempToken)
   726			httputil.BadRequestError(w, "unexpected oauth_token")
   727			return
   728		}
   729		oauthClient, err := ctx.NewOAuthClient(oAuthURIs)
   730		if err != nil {
   731			err = fmt.Errorf("error getting OAuth client: %v", err)
   732			httputil.ServeError(w, r, err)
   733			return
   734		}
   735		tokenCred, vals, err := oauthClient.RequestToken(
   736			ctxutil.Client(ctx),
   737			&oauth.Credentials{
   738				Token:  tempToken,
   739				Secret: tempSecret,
   740			},
   741			r.FormValue("oauth_verifier"),
   742		)
   743		if err != nil {
   744			httputil.ServeError(w, r, fmt.Errorf("Error getting request token: %v ", err))
   745			return
   746		}
   747		userid := vals.Get("user_id")
   748		if userid == "" {
   749			httputil.ServeError(w, r, fmt.Errorf("Couldn't get user id: %v", err))
   750			return
   751		}
   752		if err := ctx.AccountNode.SetAttrs(
   753			importer.AcctAttrAccessToken, tokenCred.Token,
   754			importer.AcctAttrAccessTokenSecret, tokenCred.Secret,
   755		); err != nil {
   756			httputil.ServeError(w, r, fmt.Errorf("Error setting token attributes: %v", err))
   757			return
   758		}
   759	
   760		u, err := getUserInfo(importer.OAuthContext{Ctx: ctx.Context, Client: oauthClient, Creds: tokenCred})
   761		if err != nil {
   762			httputil.ServeError(w, r, fmt.Errorf("Couldn't get user info: %v", err))
   763			return
   764		}
   765		if err := ctx.AccountNode.SetAttrs(
   766			importer.AcctAttrUserID, u.ID,
   767			importer.AcctAttrName, u.Name,
   768			importer.AcctAttrUserName, u.ScreenName,
   769			nodeattr.Title, fmt.Sprintf("%s's Twitter Account", u.ScreenName),
   770		); err != nil {
   771			httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err))
   772			return
   773		}
   774		http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
   775	}
   776	
// tweetItem is the view of a tweet that importTweet needs. It is implemented
// by *apiTweetItem (REST API) and *zipTweetItem (archive zip).
type tweetItem interface {
	ID() string                            // the tweet ID; implementations panic if it is empty
	LatLong() (lat, long float64, ok bool) // ok is false when no usable location is present
	CreatedAt() string                     // creation time, as the string found in the JSON
	Text() string                          // tweet text with HTML entities unescaped
	Media() []tweetMedia                   // media attached to, or linked from, the tweet
	Liked() bool                           // whether to store the item as a like rather than a tweet
}
   785	
// tweetMedia is one media item of a tweet, as consumed by importTweet.
type tweetMedia interface {
	URLs() []string // use first non-404 one
	// BaseFilename is the name the media is stored under; importTweet
	// uses it as the file name and in the "camliPath:" attribute.
	BaseFilename() string
}
   790	
// apiTweetItem is a tweet as decoded from the JSON returned by the REST API.
// It implements tweetItem.
type apiTweetItem struct {
	Id           string   `json:"id_str"`
	TextStr      string   `json:"text"`
	CreatedAtStr string   `json:"created_at"`
	Entities     entities `json:"entities"`
	Favorited    bool     `json:"favorited"`

	// One or both might be present:
	Geo         *geo    `json:"geo"`         // lat, long
	Coordinates *coords `json:"coordinates"` // geojson: long, lat
}
   802	
// zipTweetItem is like apiTweetItem, but twitter is annoying and the schema
// for the JSON inside zip files is slightly different. It implements
// tweetItem.
type zipTweetItem struct {
	Id           string `json:"id_str"`
	TextStr      string `json:"text"`
	CreatedAtStr string `json:"created_at"`

	// One or both might be present:
	Geo         *geo        `json:"geo"`         // lat, long
	Coordinates *coords     `json:"coordinates"` // geojson: long, lat
	Entities    zipEntities `json:"entities"`
}
   814	
   815	func (t *apiTweetItem) ID() string {
   816		if t.Id == "" {
   817			panic("empty id")
   818		}
   819		return t.Id
   820	}
   821	
   822	func (t *zipTweetItem) ID() string {
   823		if t.Id == "" {
   824			panic("empty id")
   825		}
   826		return t.Id
   827	}
   828	
// CreatedAt returns the tweet's creation time as the unparsed string from
// the JSON "created_at" field.
func (t *apiTweetItem) CreatedAt() string { return t.CreatedAtStr }

// CreatedAt returns the tweet's creation time as the unparsed string from
// the JSON "created_at" field.
func (t *zipTweetItem) CreatedAt() string { return t.CreatedAtStr }
   831	
// Text returns the tweet's text with HTML entities unescaped.
func (t *apiTweetItem) Text() string { return html.UnescapeString(t.TextStr) }

// Text returns the tweet's text with HTML entities unescaped.
func (t *zipTweetItem) Text() string { return html.UnescapeString(t.TextStr) }
   834	
// LatLong returns the tweet's location, taken from its Geo or Coordinates
// field by latLong. ok is false if neither yields a usable location.
func (t *apiTweetItem) LatLong() (lat, long float64, ok bool) {
	return latLong(t.Geo, t.Coordinates)
}
   838	
// LatLong returns the tweet's location, taken from its Geo or Coordinates
// field by latLong. ok is false if neither yields a usable location.
func (t *zipTweetItem) LatLong() (lat, long float64, ok bool) {
	return latLong(t.Geo, t.Coordinates)
}
   842	
   843	func latLong(g *geo, c *coords) (lat, long float64, ok bool) {
   844		if g != nil && len(g.Coordinates) == 2 {
   845			co := g.Coordinates
   846			if co[0] != 0 && co[1] != 0 {
   847				return co[0], co[1], true
   848			}
   849		}
   850		if c != nil && len(c.Coordinates) == 2 {
   851			co := c.Coordinates
   852			if co[0] != 0 && co[1] != 0 {
   853				return co[1], co[0], true
   854			}
   855		}
   856		return
   857	}
   858	
   859	func (t *zipTweetItem) Media() (ret []tweetMedia) {
   860		for _, m := range t.Entities.Media {
   861			ret = append(ret, m)
   862		}
   863		ret = append(ret, getImagesFromURLs(t.Entities.URLs)...)
   864		return
   865	}
   866	
   867	func (t *apiTweetItem) Media() (ret []tweetMedia) {
   868		for _, m := range t.Entities.Media {
   869			ret = append(ret, m)
   870		}
   871		ret = append(ret, getImagesFromURLs(t.Entities.URLs)...)
   872		return
   873	}
   874	
   875	func (t *apiTweetItem) Liked() bool { return t.Favorited }
   876	func (t *zipTweetItem) Liked() bool { return false }
   877	
   878	type geo struct {
   879		Coordinates []float64 `json:"coordinates"` // lat,long
   880	}
   881	
   882	type coords struct {
   883		Coordinates []float64 `json:"coordinates"` // long,lat
   884	}
   885	
   886	type entities struct {
   887		Media []*media     `json:"media"`
   888		URLs  []*urlEntity `json:"urls"`
   889	}
   890	
   891	type zipEntities struct {
   892		Media []*zipMedia  `json:"media"`
   893		URLs  []*urlEntity `json:"urls"`
   894	}
   895	
   896	//	e.g.  {
   897	//	  "indices" : [ 105, 125 ],
   898	//	  "url" : "http:\/\/t.co\/gbGO8Qep",
   899	//	  "expanded_url" : "http:\/\/twitpic.com\/6mdqac",
   900	//	  "display_url" : "twitpic.com\/6mdqac"
   901	//	}
   902	type urlEntity struct {
   903		URL         string `json:"url"`
   904		ExpandedURL string `json:"expanded_url"`
   905		DisplayURL  string `json:"display_url"`
   906	}
   907	
   908	var imgurRx = regexp.MustCompile(`\bimgur\.com/(\w\w\w+)`)
   909	
   910	func getImagesFromURLs(urls []*urlEntity) (ret []tweetMedia) {
   911		// TODO: extract these regexps from tweet text too. Happens in
   912		// a few cases I've seen in my history.
   913		for _, u := range urls {
   914			if strings.HasPrefix(u.DisplayURL, "twitpic.com") {
   915				ret = append(ret, twitpicImage(strings.TrimPrefix(u.DisplayURL, "twitpic.com/")))
   916				continue
   917			}
   918			if m := imgurRx.FindStringSubmatch(u.DisplayURL); m != nil {
   919				ret = append(ret, imgurImage(m[1]))
   920				continue
   921			}
   922		}
   923		return
   924	}
   925	
   926	// The Media entity from the Rest API. See also: zipMedia.
   927	type media struct {
   928		Id            string               `json:"id_str"`
   929		IdNum         int64                `json:"id"`
   930		MediaURL      string               `json:"media_url"`
   931		MediaURLHTTPS string               `json:"media_url_https"`
   932		Sizes         map[string]mediaSize `json:"sizes"`
   933		Type_         string               `json:"type"`
   934	}
   935	
   936	// The Media entity from the zip file JSON. Similar but different to
   937	// media. Thanks, Twitter.
   938	type zipMedia struct {
   939		Id            string      `json:"id_str"`
   940		IdNum         int64       `json:"id"`
   941		MediaURL      string      `json:"media_url"`
   942		MediaURLHTTPS string      `json:"media_url_https"`
   943		Sizes         []mediaSize `json:"sizes"` // without a key! useless.
   944	}
   945	
   946	func (m *media) URLs() []string {
   947		u := m.baseURL()
   948		if u == "" {
   949			return nil
   950		}
   951		return []string{u + m.largestMediaSuffix(), u}
   952	}
   953	
   954	func (m *zipMedia) URLs() []string {
   955		// We don't get any suffix names, so just try some common
   956		// ones. The first non-404 will be used:
   957		u := m.baseURL()
   958		if u == "" {
   959			return nil
   960		}
   961		return []string{
   962			u + ":large",
   963			u,
   964		}
   965	}
   966	
   967	func (m *media) baseURL() string {
   968		if v := m.MediaURLHTTPS; v != "" {
   969			return v
   970		}
   971		return m.MediaURL
   972	}
   973	
   974	func (m *zipMedia) baseURL() string {
   975		if v := m.MediaURLHTTPS; v != "" {
   976			return v
   977		}
   978		return m.MediaURL
   979	}
   980	
   981	func (m *media) BaseFilename() string {
   982		return path.Base(m.baseURL())
   983	}
   984	
   985	func (m *zipMedia) BaseFilename() string {
   986		return path.Base(m.baseURL())
   987	}
   988	
   989	func (m *media) largestMediaSuffix() string {
   990		bestPixels := 0
   991		bestSuffix := ""
   992		for k, sz := range m.Sizes {
   993			if px := sz.W * sz.H; px > bestPixels {
   994				bestPixels = px
   995				bestSuffix = ":" + k
   996			}
   997		}
   998		return bestSuffix
   999	}
  1000	
  1001	type mediaSize struct {
  1002		W      int    `json:"w"`
  1003		H      int    `json:"h"`
  1004		Resize string `json:"resize"`
  1005	}
  1006	
  1007	// An image from twitpic.
  1008	type twitpicImage string
  1009	
  1010	func (im twitpicImage) BaseFilename() string { return string(im) }
  1011	
  1012	func (im twitpicImage) URLs() []string {
  1013		return []string{"https://twitpic.com/show/large/" + string(im)}
  1014	}
  1015	
  1016	// An image from imgur
  1017	type imgurImage string
  1018	
  1019	func (im imgurImage) BaseFilename() string { return string(im) }
  1020	
  1021	func (im imgurImage) URLs() []string {
  1022		// Imgur ignores the suffix if it's .gif, .png, or .jpg. So just pick .gif.
  1023		// The actual content will be returned.
  1024		return []string{"https://i.imgur.com/" + string(im) + ".gif"}
  1025	}
Website layout inspired by memcached.
Content by the authors.