Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed 5 failing tests due to list(set(... operations in ParseResults.__init__ #9

Open
wants to merge 39 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
82cf864
Removed list(set(...)) de-duplicate operations in ParseResults.__init__
ianozsvald Sep 7, 2012
71b793a
Applied schwa's span addition
ianozsvald Sep 7, 2012
ff5a0c0
added span tests as a separate class
ianozsvald Sep 7, 2012
a202185
not sure what happened, unittest.main() does the job now
ianozsvald Sep 7, 2012
90fbc84
added test for hash and comma in URL
ianozsvald Sep 9, 2012
b25880a
uncovered two name-shielded tests and renamed, now also using non-htm…
ianozsvald Sep 10, 2012
536ba80
removed off-by-one offset for URL and hashtag matcher if a pre charac…
ianozsvald Sep 10, 2012
a8c77dc
added reference to the original project
ianozsvald Sep 13, 2012
f309568
changed URL
ianozsvald Sep 13, 2012
be4d2e3
first
ianozsvald Sep 13, 2012
489ca04
preparing for V1.0.0 release
ianozsvald Feb 11, 2013
e2c57a5
weird formatting bug
ianozsvald Feb 11, 2013
2ae04ff
weird formatting bug
ianozsvald Feb 11, 2013
c8e40cd
weird formatting bug
ianozsvald Feb 11, 2013
77ff625
weird formatting bug
ianozsvald Feb 11, 2013
4297316
weird formatting bug
ianozsvald Feb 11, 2013
22c73a9
weird formatting bug
ianozsvald Feb 11, 2013
e2e3615
weird formatting bug
ianozsvald Feb 11, 2013
4b8121c
weird formatting bug
ianozsvald Feb 11, 2013
c024c58
weird formatting bug
ianozsvald Feb 11, 2013
9b86dc1
weird formatting bug
ianozsvald Feb 11, 2013
400758b
minor
ianozsvald Feb 11, 2013
bdf7316
version bump after fixing up setup.py to use a subdirectory
ianozsvald Feb 11, 2013
52c6101
Fix t.co urls followed by a comma
lsemel Mar 25, 2013
a9973f9
added some notes for TODO
ianozsvald Mar 26, 2013
19e2368
bump of version nbr for this new working version, added a shortlink f…
ianozsvald Mar 28, 2013
1bab751
added requirements
ianozsvald Mar 28, 2013
79df69f
Merge branch 'master' of github.com:muckrack/twitter-text-python into…
ianozsvald Apr 4, 2013
dd4e932
adding some , parsing
ianozsvald Apr 4, 2013
4b2d7a0
extra note on how to run tests
ianozsvald Apr 4, 2013
f80d89c
used autopep8 to clean up the src
ianozsvald Jun 1, 2013
93f6985
minor
ianozsvald Jun 1, 2013
e00cad8
notes on pypi release and git tagging
ianozsvald Jun 1, 2013
0724099
note on pushing tags
ianozsvald Jun 1, 2013
033a5ab
cleanup
ianozsvald Jun 1, 2013
66c209b
cleanup
ianozsvald Jun 1, 2013
aa6bf1a
cleanup
ianozsvald Jun 1, 2013
756f947
point to Ed for his support
ianozsvald Jul 28, 2014
13f4990
point to Ed for his support
ianozsvald Jul 28, 2014
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Removed list(set(...)) de-duplicate operations in ParseResults.__init__
as they destroy the ordering of urls, users etc in the tweet.  The
list(set( operation on replies was dangerous as reply was a string not a
list (so the string was split into a list of set elements of
characters). Removed lots of non-pep8 whitespace
ianozsvald committed Sep 7, 2012
commit 82cf8641060725ccf5e4e00e6cc3b60191409e2c
218 changes: 109 additions & 109 deletions tests.py
Original file line number Diff line number Diff line change
@@ -24,525 +24,525 @@
class TWPTests(unittest.TestCase):
def setUp(self):
    # Build a fresh parser for every test so entity lists collected
    # during one parse cannot leak into the next assertion.
    self.parser = ttp.Parser()


# General Tests ------------------------------------------------------------
# --------------------------------------------------------------------------
def test_all_not_allow_amp_without_question(self):
result = self.parser.parse(u'Check out: http://www.github.com/test&@username')
self.assertEqual(result.html, u'Check out: <a href="http://www.github.com/test">http://www.github.com/test</a>&<a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.urls, [u'http://www.github.com/test'])

def test_all_not_break_url_at(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])


# URL tests ----------------------------------------------------------------
# --------------------------------------------------------------------------
def test_url_mid(self):
result = self.parser.parse(u'text http://example.com more text')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a> more text')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_unicode(self):
result = self.parser.parse(u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp')
self.assertEqual(result.html, u'I enjoy Macintosh Brand computers: <a href="http://✪df.ws/ejp">http://✪df.ws/ejp</a>')
self.assertEqual(result.urls, [u'http://\u272adf.ws/ejp'])

def test_url_parentheses(self):
result = self.parser.parse(u'text (http://example.com)')
self.assertEqual(result.html, u'text (<a href="http://example.com">http://example.com</a>)')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_underscore(self):
result = self.parser.parse(u'text http://example.com/test/foo_123.jpg')
self.assertEqual(result.html, u'text <a href="http://example.com/test/foo_123.jpg">http://example.com/test/foo...</a>')
self.assertEqual(result.urls, [u'http://example.com/test/foo_123.jpg'])

def test_url_underscore_dot(self):
result = self.parser.parse(u'text http://example.com/test/bla.net_foo_123.jpg')
self.assertEqual(result.html, u'text <a href="http://example.com/test/bla.net_foo_123.jpg">http://example.com/test/bla...</a>')
self.assertEqual(result.urls, [u'http://example.com/test/bla.net_foo_123.jpg'])

def test_url_amp_lang_equals(self):
result = self.parser.parse(u'Check out http://search.twitter.com/search?q=avro&lang=en')
self.assertEqual(result.html, u'Check out <a href="http://search.twitter.com/search?q=avro&amp;lang=en">http://search.twitter.com/s...</a>')
self.assertEqual(result.urls, [u'http://search.twitter.com/search?q=avro&lang=en'])

def test_url_amp_break(self):
result = self.parser.parse(u'Check out http://twitter.com/te?foo&invalid=True')
self.assertEqual(result.html, u'Check out <a href="http://twitter.com/te?foo&amp;invalid=True">http://twitter.com/te?foo...</a>')
self.assertEqual(result.urls, [u'http://twitter.com/te?foo&invalid=True'])

def test_url_dash(self):
result = self.parser.parse(u'Is www.foo-bar.com a valid URL?')
self.assertEqual(result.html, u'Is <a href="http://www.foo-bar.com">www.foo-bar.com</a> a valid URL?')
self.assertEqual(result.urls, [u'www.foo-bar.com'])

def test_url_multiple(self):
result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com')
self.assertEqual(result.html, u'<a href="http://example.com">http://example.com</a> <a href="https://sslexample.com">https://sslexample.com</a> <a href="http://sub.example.com">http://sub.example.com</a>')
self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com'])

def test_url_raw_domain(self):
result = self.parser.parse(u'See http://example.com example.com')
self.assertEqual(result.html, u'See <a href="http://example.com">http://example.com</a> example.com')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_embed_link(self):
result = self.parser.parse(u'<link rel=\'true\'>http://example.com</link>')
self.assertEqual(result.html, u'<link rel=\'true\'><a href="http://example.com">http://example.com</a></link>')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_trailing(self):
result = self.parser.parse(u'text http://example.com')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_japanese(self):
result = self.parser.parse(u'いまなにしてるhttp://example.comいまなにしてる')
self.assertEqual(result.html, u'いまなにしてる<a href="http://example.com">http://example.com</a>いまなにしてる')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_lots_of_punctuation(self):
result = self.parser.parse(u'text http://xo.com/~matthew+%-,.;x')
self.assertEqual(result.html, u'text <a href="http://xo.com/~matthew+%-,.;x">http://xo.com/~matthew+%-,.;x</a>')
self.assertEqual(result.urls, [u'http://xo.com/~matthew+%-,.;x'])

def test_url_question_numbers(self):
result = self.parser.parse(u'text http://example.com/?77e8fd')
self.assertEqual(result.html, u'text <a href="http://example.com/?77e8fd">http://example.com/?77e8fd</a>')
self.assertEqual(result.urls, [u'http://example.com/?77e8fd'])

def test_url_one_letter_other(self):
result = self.parser.parse(u'text http://u.nu/')
self.assertEqual(result.html, u'text <a href="http://u.nu/">http://u.nu/</a>')
self.assertEqual(result.urls, [u'http://u.nu/'])

result = self.parser.parse(u'text http://u.tv/')
self.assertEqual(result.html, u'text <a href="http://u.tv/">http://u.tv/</a>')
self.assertEqual(result.urls, [u'http://u.tv/'])

def test_url_one_letter_iana(self):
result = self.parser.parse(u'text http://x.com/')
self.assertEqual(result.html, u'text <a href="http://x.com/">http://x.com/</a>')
self.assertEqual(result.urls, [u'http://x.com/'])

result = self.parser.parse(u'text http://Q.com/')
self.assertEqual(result.html, u'text <a href="http://Q.com/">http://Q.com/</a>')
self.assertEqual(result.urls, [u'http://Q.com/'])

result = self.parser.parse(u'text http://z.com/')
self.assertEqual(result.html, u'text <a href="http://z.com/">http://z.com/</a>')
self.assertEqual(result.urls, [u'http://z.com/'])

result = self.parser.parse(u'text http://i.net/')
self.assertEqual(result.html, u'text <a href="http://i.net/">http://i.net/</a>')
self.assertEqual(result.urls, [u'http://i.net/'])

result = self.parser.parse(u'text http://q.net/')
self.assertEqual(result.html, u'text <a href="http://q.net/">http://q.net/</a>')
self.assertEqual(result.urls, [u'http://q.net/'])

result = self.parser.parse(u'text http://X.org/')
self.assertEqual(result.html, u'text <a href="http://X.org/">http://X.org/</a>')
self.assertEqual(result.urls, [u'http://X.org/'])

def test_url_long_hypens(self):
result = self.parser.parse(u'text http://word-and-a-number-8-ftw.domain.tld/')
self.assertEqual(result.html, u'text <a href="http://word-and-a-number-8-ftw.domain.tld/">http://word-and-a-number-8-...</a>')
self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/'])


# URL not tests ------------------------------------------------------------
def test_not_url_dotdotdot(self):
result = self.parser.parse(u'Is www...foo a valid URL?')
self.assertEqual(result.html, u'Is www...foo a valid URL?')
self.assertEqual(result.urls, [])

def test_not_url_dash(self):
result = self.parser.parse(u'Is www.-foo.com a valid URL?')
self.assertEqual(result.html, u'Is www.-foo.com a valid URL?')
self.assertEqual(result.urls, [])

def test_not_url_no_tld(self):
result = self.parser.parse(u'Is http://no-tld a valid URL?')
self.assertEqual(result.html, u'Is http://no-tld a valid URL?')
self.assertEqual(result.urls, [])

def test_not_url_tld_too_short(self):
result = self.parser.parse(u'Is http://tld-too-short.x a valid URL?')
self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?')
self.assertEqual(result.urls, [])

def test_all_not_break_url_at(self):
    # NOTE(review): duplicate definition -- an identical test with this
    # name is defined earlier in the class, so Python keeps only this
    # later one; rename one of the two so both actually run.
    result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
    self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
    self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])

def test_not_url_one_letter_iana(self):
result = self.parser.parse(u'text http://a.com/ http://a.net/ http://a.org/')
self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/')
self.assertEqual(result.urls, [])


# URL followed Tests -------------------------------------------------------
def test_url_followed_question(self):
result = self.parser.parse(u'text http://example.com?')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>?')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_colon(self):
result = self.parser.parse(u'text http://example.com:')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>:')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_curly_brace(self):
result = self.parser.parse(u'text http://example.com}')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>}')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_single_quote(self):
    # NOTE(review): the input contains no single quote, so this test does
    # not exercise what its name promises -- presumably the intent was
    # u"text http://example.com'" with the quote left outside the link;
    # confirm the parser's behaviour and fix the fixture.
    result = self.parser.parse(u'text http://example.com')
    self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>')
    self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_dot(self):
result = self.parser.parse(u'text http://example.com.')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>.')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_exclamation(self):
result = self.parser.parse(u'text http://example.com!')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>!')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_comma(self):
result = self.parser.parse(u'text http://example.com,')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>,')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_brace(self):
result = self.parser.parse(u'text http://example.com)')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>)')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_big_brace(self):
result = self.parser.parse(u'text http://example.com]')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>]')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_equals(self):
result = self.parser.parse(u'text http://example.com=')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>=')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_semicolon(self):
result = self.parser.parse(u'text http://example.com;')
self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>;')
self.assertEqual(result.urls, [u'http://example.com'])

def test_url_followed_hypen(self):
result = self.parser.parse(u'text http://domain.tld-that-you-should-have-put-a-space-after')
self.assertEqual(result.html, u'text <a href="http://domain.tld">http://domain.tld</a>-that-you-should-have-put-a-space-after')
self.assertEqual(result.urls, [u'http://domain.tld'])


# URL preceeded Tests -------------------------------------------------------
def test_url_preceeded_colon(self):
result = self.parser.parse(u'text:http://example.com')
self.assertEqual(result.html, u'text:<a href="http://example.com">http://example.com</a>')
self.assertEqual(result.urls, [u'http://example.com'])

def test_not_url_preceeded_equals(self):
result = self.parser.parse(u'text =http://example.com')
self.assertEqual(result.html, u'text =http://example.com')
self.assertEqual(result.urls, [])

# NOT
def test_not_url_preceeded_forwardslash(self):
result = self.parser.parse(u'text /http://example.com')
self.assertEqual(result.html, u'text /http://example.com')
self.assertEqual(result.urls, [])

def test_not_url_preceeded_exclamation(self):
result = self.parser.parse(u'text !http://example.com')
self.assertEqual(result.html, u'text !http://example.com')
self.assertEqual(result.urls, [])


# URL numeric tests --------------------------------------------------------
def test_url_at_numeric(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])

def test_url_at_non_numeric(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/foobar')
self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/foobar">http://www.flickr.com/photo...</a>')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar'])


# URL domain tests ---------------------------------------------------------
def test_url_WWW(self):
result = self.parser.parse(u'WWW.EXAMPLE.COM')
self.assertEqual(result.html, u'<a href="http://WWW.EXAMPLE.COM">WWW.EXAMPLE.COM</a>')
self.assertEqual(result.urls, [u'WWW.EXAMPLE.COM'])

def test_url_www(self):
result = self.parser.parse(u'www.example.com')
self.assertEqual(result.html, u'<a href="http://www.example.com">www.example.com</a>')
self.assertEqual(result.urls, [u'www.example.com'])

def test_url_only_domain_query_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.')
self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why?=because.i.want.it">http://tell.me/why?=because...</a>. Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it'])

def test_url_only_domain_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.')
self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me">http://tell.me</a>. Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me'])

def test_url_only_domain_path_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.')
self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why">http://tell.me/why</a>. Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me/why'])

def test_url_long_tld(self):
result = self.parser.parse(u'http://example.mobi/path')
self.assertEqual(result.html, u'<a href="http://example.mobi/path">http://example.mobi/path</a>')
self.assertEqual(result.urls, [u'http://example.mobi/path'])

def test_url_multiple_protocols(self):
result = self.parser.parse(u'http://foo.com AND https://bar.com AND www.foobar.com')
self.assertEqual(result.html, u'<a href="http://foo.com">http://foo.com</a> AND <a href="https://bar.com">https://bar.com</a> AND <a href="http://www.foobar.com">www.foobar.com</a>')
self.assertEqual(result.urls, [u'http://foo.com', u'https://bar.com', u'www.foobar.com'])

# NOT
def test_not_url_exclamation_domain(self):
result = self.parser.parse(u'badly formatted http://foo!bar.com')
self.assertEqual(result.html, u'badly formatted http://foo!bar.com')
self.assertEqual(result.urls, [])

def test_not_url_under_domain(self):
result = self.parser.parse(u'badly formatted http://foo_bar.com')
self.assertEqual(result.html, u'badly formatted http://foo_bar.com')
self.assertEqual(result.urls, [])


# Hashtag tests ------------------------------------------------------------
# --------------------------------------------------------------------------
def test_hashtag_followed_full_whitespace(self):
result = self.parser.parse(u'#hashtag text')
self.assertEqual(result.html, u'<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a> text')
self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_followed_full_hash(self):
result = self.parser.parse(u'#hashtag')
self.assertEqual(result.html, u'<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_preceeded_full_whitespace(self):
result = self.parser.parse(u'text #hashtag')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_number(self):
result = self.parser.parse(u'text #1tag')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%231tag">#1tag</a>')
self.assertEqual(result.tags, [u'1tag'])

def test_not_hashtag_escape(self):
result = self.parser.parse(u'&#nbsp;')
self.assertEqual(result.html, u'&#nbsp;')
self.assertEqual(result.tags, [])

def test_hashtag_japanese(self):
result = self.parser.parse(u'text #hashtagの')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>の')
self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_period(self):
result = self.parser.parse(u'text.#hashtag')
self.assertEqual(result.html, u'text.<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_trailing(self):
result = self.parser.parse(u'text #hashtag')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
self.assertEqual(result.tags, [u'hashtag'])

def test_not_hashtag_exclamation(self):
    # NOTE(review): despite the "not_" prefix this asserts the hashtag
    # IS parsed (the trailing "!" merely stays outside the link); the
    # name is misleading and should drop the "not_".
    result = self.parser.parse(u'text #hashtag!')
    self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>!')
    self.assertEqual(result.tags, [u'hashtag'])

def test_hashtag_multiple(self):
result = self.parser.parse(u'text #hashtag1 #hashtag2')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag1">#hashtag1</a> <a href="http://search.twitter.com/search?q=%23hashtag2">#hashtag2</a>')
self.assertEqual(result.tags, [u'hashtag1', u'hashtag2'])

def test_not_hashtag_number(self):
result = self.parser.parse(u'text #1234')
self.assertEqual(result.html, u'text #1234')
self.assertEqual(result.tags, [])

def test_not_hashtag_text(self):
result = self.parser.parse(u'text#hashtag')
self.assertEqual(result.html, u'text#hashtag')
self.assertEqual(result.tags, [])

def test_hashtag_umlaut(self):
result = self.parser.parse(u'text #hash_tagüäö')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash_tag%C3%BC%C3%A4%C3%B6">#hash_tagüäö</a>')
self.assertEqual(result.tags, [u'hash_tag\xfc\xe4\xf6'])

def test_hashtag_alpha(self):
result = self.parser.parse(u'text #hash0tag')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash0tag">#hash0tag</a>')
self.assertEqual(result.tags, [u'hash0tag'])

def test_hashtag_under(self):
result = self.parser.parse(u'text #hash_tag')
self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash_tag">#hash_tag</a>')
self.assertEqual(result.tags, [u'hash_tag'])


# Username tests -----------------------------------------------------------
# --------------------------------------------------------------------------
def test_not_username_preceded_letter(self):
result = self.parser.parse(u'meet@the beach')
self.assertEqual(result.html, u'meet@the beach')
self.assertEqual(result.users, [])

def test_username_preceded_punctuation(self):
result = self.parser.parse(u'.@username')
self.assertEqual(result.html, u'.<a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])

def test_username_preceded_japanese(self):
result = self.parser.parse(u'あ@username')
self.assertEqual(result.html, u'あ<a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])

def test_username_followed_japanese(self):
result = self.parser.parse(u'@usernameの')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>の')
self.assertEqual(result.users, [u'username'])

def test_username_surrounded_japanese(self):
result = self.parser.parse(u'あ@usernameの')
self.assertEqual(result.html, u'あ<a href="http://twitter.com/username">@username</a>の')
self.assertEqual(result.users, [u'username'])

def test_username_followed_punctuation(self):
result = self.parser.parse(u'@username&^$%^')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>&^$%^')
self.assertEqual(result.users, [u'username'])

def test_not_username_spaced(self):
result = self.parser.parse(u'@ username')
self.assertEqual(result.html, u'@ username')
self.assertEqual(result.users, [])

def test_username_beginning(self):
result = self.parser.parse(u'@username text')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a> text')
self.assertEqual(result.users, [u'username'])

def test_username_to_long(self):
    # "to_long" is a typo for "too_long": usernames are capped at 20
    # characters here, with the 21st character left outside the link.
    result = self.parser.parse(u'@username9012345678901')
    self.assertEqual(result.html, u'<a href="http://twitter.com/username901234567890">@username901234567890</a>1')
    self.assertEqual(result.users, [u'username901234567890'])

def test_username_full_at_sign(self):
result = self.parser.parse(u'@username')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])

def test_username_trailing(self):
result = self.parser.parse(u'text @username')
self.assertEqual(result.html, u'text <a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])

# Replies
def test_username_reply_simple(self):
result = self.parser.parse(u'@username')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')

def test_username_reply_whitespace(self):
result = self.parser.parse(u' @username')
self.assertEqual(result.html, u' <a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')

def test_username_reply_full(self):
result = self.parser.parse(u' @username')
self.assertEqual(result.html, u' <a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')

def test_username_non_reply(self):
result = self.parser.parse(u'test @username')
self.assertEqual(result.html, u'test <a href="http://twitter.com/username">@username</a>')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, None)


# List tests ---------------------------------------------------------------
# --------------------------------------------------------------------------
def test_list_preceeded(self):
result = self.parser.parse(u'text @username/list')
self.assertEqual(result.html, u'text <a href="http://twitter.com/username/list">@username/list</a>')
self.assertEqual(result.lists, [(u'username', u'list')])

def test_list_beginning(self):
result = self.parser.parse(u'@username/list')
self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>')
self.assertEqual(result.lists, [(u'username', u'list')])

def test_list_preceeded_punctuation(self):
result = self.parser.parse(u'.@username/list')
self.assertEqual(result.html, u'.<a href="http://twitter.com/username/list">@username/list</a>')
self.assertEqual(result.lists, [(u'username', u'list')])

def test_list_followed_punctuation(self):
result = self.parser.parse(u'@username/list&^$%^')
self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>&^$%^')
self.assertEqual(result.lists, [(u'username', u'list')])

def test_list_not_slash_space(self):
result = self.parser.parse(u'@username/ list')
self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>/ list')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.lists, [])

def test_list_beginning(self):
    # NOTE(review): duplicate definition -- an identical test with this
    # name is defined earlier in the class, so Python keeps only this
    # later one; rename one of the two so both actually run.
    result = self.parser.parse(u'@username/list')
    self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>')
    self.assertEqual(result.lists, [(u'username', u'list')])

def test_list_not_empty_username(self):
result = self.parser.parse(u'text @/list')
self.assertEqual(result.html, u'text @/list')
self.assertEqual(result.lists, [])

def test_list_not_preceeded_letter(self):
result = self.parser.parse(u'meet@the/beach')
self.assertEqual(result.html, u'meet@the/beach')
self.assertEqual(result.lists, [])

def test_list_long_truncate(self):
result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
self.assertEqual(result.html, u'<a href="http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890">@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A')
self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')])

def test_list_with_dash(self):
result = self.parser.parse(u'text @username/list-foo')
self.assertEqual(result.html, u'text <a href="http://twitter.com/username/list-foo">@username/list-foo</a>')
103 changes: 51 additions & 52 deletions ttp.py
Original file line number Diff line number Diff line change
@@ -60,202 +60,201 @@
PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS),
re.IGNORECASE)


# Registered IANA one letter domains
IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net')


class ParseResult(object):
    '''A class containing the results of a parsed Tweet.

    Attributes:
    - urls
        A list containing all the valid urls in the Tweet, in the order
        they appear (duplicates are preserved).
    - users
        A list containing all the valid usernames in the Tweet.
    - reply
        A string containing the username this tweet was a reply to, or
        None. This only matches a username at the beginning of the
        Tweet, it may however be preceded by whitespace.
        Note: It's generally better to rely on the Tweet JSON/XML in
        order to find out if it's a reply or not.
    - lists
        A list containing all the valid lists in the Tweet.
        Each list item is a tuple in the format (username, listname).
    - tags
        A list containing all the valid tags in the Tweet.
    - html
        A string containing formatted HTML.
        To change the formatting subclass ttp.Parser and override the
        format_* methods.
    '''

    def __init__(self, urls, users, reply, lists, tags, html):
        # Deliberately NO list(set(...)) de-duplication: it destroyed the
        # order of urls/users/tags, and applying it to ``reply`` (a
        # string, not a list) exploded the name into a set of single
        # characters. The diff artifact that kept both the old and new
        # assignments has been removed -- only the corrected ones remain.
        self.urls = urls if urls else []
        self.users = users if users else []
        self.lists = lists if lists else []
        self.reply = reply if reply else None
        self.tags = tags if tags else []
        self.html = html


class Parser(object):
'''A Tweet Parser'''

def __init__(self, max_url_length=30):
self._max_url_length = max_url_length

def parse(self, text, html=True):
    '''Parse the text and return a ParseResult instance.'''
    # Reset the per-call accumulators that the _parse_* callbacks fill.
    self._urls, self._users = [], []
    self._lists, self._tags = [], []

    match = REPLY_REGEX.match(text)
    reply = None if match is None else match.groups(0)[0]

    rendered = self._html(text) if html else self._text(text)
    return ParseResult(self._urls, self._users, reply,
                       self._lists, self._tags, rendered)

def _text(self, text):
    '''Parse a Tweet without generating HTML.

    The substitution passes are run purely for their side effect of
    filling self._urls / _users / _lists / _tags; no HTML is kept.
    '''
    passes = ((URL_REGEX, self._parse_urls),
              (USERNAME_REGEX, self._parse_users),
              (LIST_REGEX, self._parse_lists),
              (HASHTAG_REGEX, self._parse_tags))
    for regex, callback in passes:
        regex.sub(callback, text)
    return None

def _html(self, text):
    '''Parse a Tweet and generate HTML.

    Each pass rewrites the output of the previous one, in the same
    order as _text: urls, usernames, lists, then hashtags.
    '''
    rendered = text
    passes = ((URL_REGEX, self._parse_urls),
              (USERNAME_REGEX, self._parse_users),
              (LIST_REGEX, self._parse_lists),
              (HASHTAG_REGEX, self._parse_tags))
    for regex, callback in passes:
        rendered = regex.sub(callback, rendered)
    return rendered


# Internal parser stuff ----------------------------------------------------
def _parse_urls(self, match):
    '''Parse one URL match: record it and return its HTML replacement.'''

    mat = match.group(0)

    # Fix a bug in the regex concerning www...com and www.-foo.com domains
    # TODO fix this in the regex instead of working around it here
    # (assumes group(5) of URL_REGEX captures the bare domain -- TODO
    # confirm against the regex defined earlier in the file)
    domain = match.group(5)
    if domain[0] in '.-':
        return mat

    # Only allow IANA one letter domains that are actually registered
    if len(domain) == 5 \
        and domain[-4:].lower() in ('.com', '.org', '.net') \
        and not domain.lower() in IANA_ONE_LETTER_DOMAINS:

        return mat

    # Check for urls without http(s)
    pos = mat.find('http')
    if pos != -1:
        pre, url = mat[:pos], mat[pos:]
        full_url = url

    # Find the www and force http:// so the href is clickable
    else:
        pos = mat.lower().find('www')
        pre, url = mat[:pos], mat[pos:]
        full_url = 'http://%s' % url

    self._urls.append(url)

    # NOTE(review): self._html is the *bound method* _html, so this
    # check is always truthy; it never distinguishes text mode from
    # HTML mode. Harmless because _text() discards re.sub's result,
    # but misleading -- consider a real flag.
    if self._html:
        return '%s%s' % (pre, self.format_url(full_url,
                         self._shorten_url(escape(url))))

def _parse_users(self, match):
    '''Collect a matched @username and return its HTML replacement.'''

    # A list match (group 2 present) is handled by _parse_lists instead.
    if match.group(2) is not None:
        return match.group(0)

    matched = match.group(0)
    self._users.append(matched[1:])

    # NOTE: self._html is a bound method, hence always truthy.
    if self._html:
        return self.format_username(matched[0:1], matched[1:])

def _parse_lists(self, match):
    '''Collect a matched @username/listname and return its replacement.'''

    # Without group 4 (the "/listname" part) this is a bare username,
    # which _parse_users handles instead.
    if match.group(4) is None:
        return match.group(0)

    pre, at_char, user, raw_name = match.groups()
    name = raw_name[1:]  # strip the leading '/'
    self._lists.append((user, name))

    # NOTE: self._html is a bound method, hence always truthy.
    if self._html:
        return '%s%s' % (pre, self.format_list(at_char, user, name))

def _parse_tags(self, match):
    '''Parse one hashtag match: record it and return its HTML replacement.'''

    mat = match.group(0)

    # Fix problems with the regex capturing stuff infront of the #
    # Scan for an ASCII '#' or the fullwidth hash U+FF03; the regex
    # should guarantee one is present -- if neither were, pos would be
    # left at -1 and the slicing below would misbehave (TODO confirm
    # HASHTAG_REGEX guarantees this).
    tag = None
    for i in u'#\uff03':
        pos = mat.rfind(i)
        if pos != -1:
            tag = i
            break

    pre, text = mat[:pos], mat[pos + 1:]
    self._tags.append(text)

    # NOTE(review): self._html is the bound method _html, so this check
    # is always truthy; it never distinguishes text mode from HTML mode.
    if self._html:
        return '%s%s' % (pre, self.format_tag(tag, text))

def _shorten_url(self, text):
'''Shorten a URL and make sure to not cut of html entities.'''

if len(text) > self._max_url_length and self._max_url_length != -1:
text = text[0:self._max_url_length - 3]
amp = text.rfind('&')
close = text.rfind(';')
if amp != -1 and (close == -1 or close < amp):
text = text[0:amp]

return text + '...'

else:
return text


# User defined formatters --------------------------------------------------
def format_tag(self, tag, text):
    '''Return formatted HTML for a hashtag.

    ``tag`` is the matched hash character ('#' or fullwidth U+FF03),
    ``text`` the tag body without it.
    '''
    # NOTE(review): urllib.quote and .encode('utf-8') on the text are
    # Python 2 idioms; on Python 3 this needs urllib.parse.quote.
    return '<a href="http://search.twitter.com/search?q=%s">%s%s</a>' \
        % (urllib.quote('#' + text.encode('utf-8')), tag, text)

def format_username(self, at_char, user):
    '''Return an HTML anchor linking *user*'s Twitter profile; the
    visible text keeps the original at-character prefix.'''
    template = '<a href="http://twitter.com/%s">%s%s</a>'
    return template % (user, at_char, user)

def format_list(self, at_char, user, list_name):
    '''Return an HTML anchor linking the Twitter list
    *user*/*list_name*; the visible text keeps the at-character.'''
    template = '<a href="http://twitter.com/%s/%s">%s%s/%s</a>'
    return template % (user, list_name, at_char, user, list_name)

def format_url(self, url, text):
    '''Return an HTML anchor for *url* whose visible label is *text*
    (typically the shortened form); the href is entity-escaped.'''
    template = '<a href="%s">%s</a>'
    return template % (escape(url), text)