Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

In this assignment we are going to work with a larger collection of tweets (10,000).

ID: 3843698 • Letter: I

Question

In this assignment we are going to work with a larger collection of tweets (10,000) that are available here:
http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/Assignment5.txt

a. Write and execute a SQL query to do the following: Find the user (“id” and “name”) with the minimum “friend_count” in the database


b. Write python code that is going to perform the same computation (find the user with the minimum “friend_count”)


c. Write and execute SQL query to do the following: Find the tweets without associated geo entry (hint: it should involve a NULL).


d. Write python code that is going to perform the same computation as 2-c.

e. Write and execute SQL query that finds the longest and the shortest tweet text message (if there is a tie, you must return all shortest and longest tweet messages, not just one).

Explanation / Answer

a.

c.

# -----------------------------------------------------------------------------
# Twitter streaming collector for tweets mentioning Singapore MRT stations.
#
# For each matching (non-retweet) tweet the script:
#   * appends the raw JSON message to an hourly "tweets-YYYYMMDD-HH.data" file,
#   * if the tweet is geotagged and its text contains an MRT line/station code
#     (e.g. "NS1"), appends the coordinates to mrt_station_locations.csv,
#   * prints a one-line summary to stdout.
#
# Fixes vs. the original paste (which was Python 2 and lost characters):
#   * print statements / dict.has_key -> Python 3 equivalents
#   * missing imports added (re, tweepy's StreamListener / OAuthHandler)
#   * regexes restored: r'([A-Z][A-Z]d+)' -> r'([A-Z][A-Z]\d+)' and
#     r'^RTs' -> r'^RT\s' (backslashes were evidently stripped in the paste)
#   * enc() now round-trips back to str so regex matching and %-formatting
#     work under Python 3 (the original kept Python 2 byte strings)
#   * the UTC-normalized timestamp (d_tz) is actually used for the Singapore
#     time conversion (the original computed it and then converted d instead)
#   * tweet['user']['location'] can be None in the payload; guarded
# -----------------------------------------------------------------------------

import json
import re

import dateutil.parser
import pytz
from pytz import timezone
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

# The consumer keys can be found on your application's Details
# page located at https://dev.twitter.com/apps (under "OAuth settings")
CONSUMER_KEY = 'XXXXXXXXXXXXXXXXXXXXXX'
CONSUMER_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# The access tokens can be found on your application's Details
# page located at https://dev.twitter.com/apps (located
# under "Your access token")
ACCESS_TOKEN = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
ACCESS_TOKEN_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

sgtz = timezone('Asia/Singapore')
utc = pytz.timezone('UTC')

# Station names used both as the stream's track filter and for local matching.
STATIONS = [
    'Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bartley MRT',
    'Bayfront MRT', 'Bedok MRT', 'Bishan MRT', 'Bras Basah MRT',
    'Botanic Gardens MRT', 'Braddell MRT', 'Bukit Batok MRT',
    'Bukit Gombak MRT', 'Caldecott MRT', 'Choa Chu Kang MRT',
    'Boon Keng MRT', 'Boon Lay MRT', 'Buangkok MRT', 'Bugis MRT',
    'Buona Vista MRT', 'Changi Airport MRT', 'Chinatown MRT',
    'Clarke Quay MRT', 'Chinese Garden MRT', 'City Hall MRT',
    'Clementi MRT', 'Commonwealth MRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
    'Dover MRT', 'Esplanade MRT', 'Eunos MRT', 'Expo MRT',
    'Farrer Park MRT', 'Farrer Road MRT', 'HarbourFront MRT',
    'Haw Par Villa MRT', 'Holland Village MRT', 'Hougang MRT',
    'Joo Koon MRT', 'Jurong East MRT', 'Kallang MRT', 'Kovan MRT',
    'Kembangan MRT', 'Kent Ridge MRT', 'Khatib MRT', 'Kranji MRT',
    'Lakeside MRT', 'Labrador Park MRT', 'Lavender MRT',
    'Little India MRT', 'Lorong Chuan MRT', 'Marina Bay MRT',
    'Marsiling MRT', 'MacPherson MRT', 'Marymount MRT',
    'Mountbatten MRT', 'Newton MRT', 'Nicoll Highway MRT',
    'one-north MRT', 'Novena MRT', 'Orchard MRT', 'Outram Park MRT',
    'Pasir Ris MRT', 'Pasir Panjang MRT', 'Paya Lebar MRT',
    'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT',
    'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
    'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT',
    'Somerset MRT', 'Stadium MRT', 'Tampines MRT', 'Tai Seng MRT',
    'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tiong Bahru MRT',
    'Telok Blangah MRT', 'Toa Payoh MRT', 'Woodlands MRT',
    'Woodleigh MRT', 'Yew Tree MRT', 'Yio Chu Kang MRT', 'Yishun MRT',
]

# Case-insensitive match of any station name within the (lowercased) text.
regex = re.compile('|'.join(STATIONS).lower())
# MRT line/station codes such as "NS1" or "EW24".
linenum_re = re.compile(r'([A-Z][A-Z]\d+)')
# Retweets start with "RT " -- skipped below.
retweets_re = re.compile(r'^RT\s')
# Drop characters not representable in latin-1 but keep a str
# (Python 3 regexes and %-formatting need str, not bytes).
enc = lambda x: x.encode('latin1', errors='ignore').decode('latin1')


class StdOutListener(StreamListener):
    """Stream listener that filters, archives and summarizes MRT tweets."""

    def on_data(self, data):
        """Handle one raw stream message; return True to keep streaming."""
        tweet = json.loads(data)
        if 'user' not in tweet:
            # Keep-alive / delete notices carry no 'user' key.
            print('No user data - ignoring tweet.')
            return True

        user = enc(tweet['user']['name'])
        text = enc(tweet['text'])

        # Ignore text that doesn't contain one of the station keywords.
        matches = regex.search(text.lower())
        if not matches:
            return True
        # Ignore retweets.
        if retweets_re.search(text):
            return True

        # 'location' may be None in the payload; fall back to empty string.
        location = enc(tweet['user']['location'] or '')
        source = enc(tweet['source'])

        d = dateutil.parser.parse(enc(tweet['created_at']))
        # Normalize to UTC, then convert to Singapore time.
        d_tz = utc.normalize(d)
        localtime = d_tz.astimezone(sgtz)
        tmstr = localtime.strftime("%Y%m%d-%H:%M:%S")

        # Append the raw JSON to the hourly tweet file.
        with open('tweets-%s.data' % tmstr.split(':')[0], 'a+') as f:
            f.write(data)

        # Is this a geocoded tweet?
        geo = tweet['geo']
        if geo and geo['type'] == 'Point':
            # Collect the location of the MRT station.
            coords = geo['coordinates']
            ln = linenum_re.search(text)
            if ln:
                with open('mrt_station_locations.csv', 'a+') as mrtgeo:
                    # geo coordinates arrive as [lat, lon]; log as (lon, lat).
                    print("Found geo coords for MRT Station (%s) '%s': (%f, %f) "
                          % (ln.group(), matches.group(), coords[1], coords[0]))
                    mrtgeo.write("%f %f %s %s " % (coords[1], coords[0],
                                                   matches.group(), ln.group()))

        # Print a summary of the tweet.
        print('%s %s %s %s %s ---------------- '
              % (user, location, source, tmstr, text))
        return True

    def on_error(self, status):
        """Report a stream error status code; the stream stays open."""
        print('status: %s' % status)


if __name__ == '__main__':
    listener = StdOutListener()
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    stream = Stream(auth, listener, timeout=60)
    print("Listening to filter stream...")
    stream.filter(track=STATIONS)