diff --git a/dexter.core b/dexter.core new file mode 100644 index 00000000..488691b8 --- /dev/null +++ b/dexter.core @@ -0,0 +1,283 @@ +%!PS-Adobe-3.0 +%%Creator: (ImageMagick) +%%Title: (dexter.core) +%%CreationDate: (2018-02-13T12:57:01+02:00) +%%BoundingBox: 2462 297 2496 298 +%%HiResBoundingBox: 2462 297 2496 298 +%%DocumentData: Clean7Bit +%%LanguageLevel: 1 +%%Orientation: Portrait +%%PageOrder: Ascend +%%Pages: 1 +%%EndComments + +%%BeginDefaults +%%EndDefaults + +%%BeginProlog +% +% Display a color image. The image is displayed in color on +% Postscript viewers or printers that support color, otherwise +% it is displayed as grayscale. +% +/DirectClassPacket +{ + % + % Get a DirectClass packet. + % + % Parameters: + % red. + % green. + % blue. + % length: number of pixels minus one of this color (optional). + % + currentfile color_packet readhexstring pop pop + compression 0 eq + { + /number_pixels 3 def + } + { + currentfile byte readhexstring pop 0 get + /number_pixels exch 1 add 3 mul def + } ifelse + 0 3 number_pixels 1 sub + { + pixels exch color_packet putinterval + } for + pixels 0 number_pixels getinterval +} bind def + +/DirectClassImage +{ + % + % Display a DirectClass image. + % + systemdict /colorimage known + { + columns rows 8 + [ + columns 0 0 + rows neg 0 rows + ] + { DirectClassPacket } false 3 colorimage + } + { + % + % No colorimage operator; convert to grayscale. + % + columns rows 8 + [ + columns 0 0 + rows neg 0 rows + ] + { GrayDirectClassPacket } image + } ifelse +} bind def + +/GrayDirectClassPacket +{ + % + % Get a DirectClass packet; convert to grayscale. + % + % Parameters: + % red + % green + % blue + % length: number of pixels minus one of this color (optional). 
+ % + currentfile color_packet readhexstring pop pop + color_packet 0 get 0.299 mul + color_packet 1 get 0.587 mul add + color_packet 2 get 0.114 mul add + cvi + /gray_packet exch def + compression 0 eq + { + /number_pixels 1 def + } + { + currentfile byte readhexstring pop 0 get + /number_pixels exch 1 add def + } ifelse + 0 1 number_pixels 1 sub + { + pixels exch gray_packet put + } for + pixels 0 number_pixels getinterval +} bind def + +/GrayPseudoClassPacket +{ + % + % Get a PseudoClass packet; convert to grayscale. + % + % Parameters: + % index: index into the colormap. + % length: number of pixels minus one of this color (optional). + % + currentfile byte readhexstring pop 0 get + /offset exch 3 mul def + /color_packet colormap offset 3 getinterval def + color_packet 0 get 0.299 mul + color_packet 1 get 0.587 mul add + color_packet 2 get 0.114 mul add + cvi + /gray_packet exch def + compression 0 eq + { + /number_pixels 1 def + } + { + currentfile byte readhexstring pop 0 get + /number_pixels exch 1 add def + } ifelse + 0 1 number_pixels 1 sub + { + pixels exch gray_packet put + } for + pixels 0 number_pixels getinterval +} bind def + +/PseudoClassPacket +{ + % + % Get a PseudoClass packet. + % + % Parameters: + % index: index into the colormap. + % length: number of pixels minus one of this color (optional). + % + currentfile byte readhexstring pop 0 get + /offset exch 3 mul def + /color_packet colormap offset 3 getinterval def + compression 0 eq + { + /number_pixels 3 def + } + { + currentfile byte readhexstring pop 0 get + /number_pixels exch 1 add 3 mul def + } ifelse + 0 3 number_pixels 1 sub + { + pixels exch color_packet putinterval + } for + pixels 0 number_pixels getinterval +} bind def + +/PseudoClassImage +{ + % + % Display a PseudoClass image. + % + % Parameters: + % class: 0-PseudoClass or 1-Grayscale. 
+ % + currentfile buffer readline pop + token pop /class exch def pop + class 0 gt + { + currentfile buffer readline pop + token pop /depth exch def pop + /grays columns 8 add depth sub depth mul 8 idiv string def + columns rows depth + [ + columns 0 0 + rows neg 0 rows + ] + { currentfile grays readhexstring pop } image + } + { + % + % Parameters: + % colors: number of colors in the colormap. + % colormap: red, green, blue color packets. + % + currentfile buffer readline pop + token pop /colors exch def pop + /colors colors 3 mul def + /colormap colors string def + currentfile colormap readhexstring pop pop + systemdict /colorimage known + { + columns rows 8 + [ + columns 0 0 + rows neg 0 rows + ] + { PseudoClassPacket } false 3 colorimage + } + { + % + % No colorimage operator; convert to grayscale. + % + columns rows 8 + [ + columns 0 0 + rows neg 0 rows + ] + { GrayPseudoClassPacket } image + } ifelse + } ifelse +} bind def + +/DisplayImage +{ + % + % Display a DirectClass or PseudoClass image. + % + % Parameters: + % x & y translation. + % x & y scale. + % label pointsize. + % image label. + % image columns & rows. + % class: 0-DirectClass or 1-PseudoClass. + % compression: 0-none or 1-RunlengthEncoded. + % hex color packets. 
+ % + gsave + /buffer 512 string def + /byte 1 string def + /color_packet 3 string def + /pixels 768 string def + + currentfile buffer readline pop + token pop /x exch def + token pop /y exch def pop + x y translate + currentfile buffer readline pop + token pop /x exch def + token pop /y exch def pop + currentfile buffer readline pop + token pop /pointsize exch def pop + /Times-Roman findfont pointsize scalefont setfont + x y scale + currentfile buffer readline pop + token pop /columns exch def + token pop /rows exch def pop + currentfile buffer readline pop + token pop /class exch def pop + currentfile buffer readline pop + token pop /compression exch def pop + class 0 gt { PseudoClassImage } { DirectClassImage } ifelse + grestore + showpage +} bind def +%%EndProlog +%%Page: 1 1 +%%PageBoundingBox: 2462 297 2496 298 +DisplayImage +2462 297 +34 1 +12 +34 1 +0 +0 +B2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BB +B2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BB +B2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BBB2B4BB + +%%PageTrailer +%%Trailer +%%EOF diff --git a/dexter/models/country.py b/dexter/models/country.py index 06969d77..84f88ae5 100644 --- a/dexter/models/country.py +++ b/dexter/models/country.py @@ -50,6 +50,10 @@ def create_defaults(cls): Germany|de United Kingdom (Great Britain)|gb Kenya|ke +Nigeria|ng +France|fr +United States of America|us +China|cn """ countries = [] diff --git a/dexter/models/fdi.py b/dexter/models/fdi.py index f18fccbd..beeea7f0 100644 --- a/dexter/models/fdi.py +++ b/dexter/models/fdi.py @@ -775,7 +775,7 @@ class Involvements2(db.Model): __tablename__ = "involvements2" id = Column(Integer, primary_key=True) - name = Column(String(50), index=True, nullable=False, unique=True) + name = Column(String(128), index=True, nullable=False, unique=True) def __repr__(self): return "" % (self.name) @@ -878,7 +878,7 @@ class Involvements3(db.Model): __tablename__ = "involvements3" id = 
Column(Integer, primary_key=True) - name = Column(String(50), index=True, nullable=False, unique=True) + name = Column(String(128), index=True, nullable=False, unique=True) def __repr__(self): return "" % (self.name) diff --git a/dexter/models/medium.py b/dexter/models/medium.py index 87f34556..a5876249 100644 --- a/dexter/models/medium.py +++ b/dexter/models/medium.py @@ -40,7 +40,8 @@ def is_tld_exception(cls, url): """ url_exceptions = [ 'thecitizen.co.tz', - 'dailynews.co.tz' + 'dailynews.co.tz', + 'mathewnyaungwa.blogspot.co.za' ] for ex in url_exceptions: # check if it exists in the url add buffer for [https://www.] characters at start @@ -51,10 +52,12 @@ def is_tld_exception(cls, url): @classmethod def for_url(cls, url): + sub_domain_exception_list = [ + 'blogspot.co.za' + ] domain = get_tld(url, fail_silently=True) # fail silently - - if domain is None: + if domain is None or domain in sub_domain_exception_list: domain = cls.is_tld_exception(url) if domain is None: @@ -175,6 +178,56 @@ def create_defaults(cls): The East African|online|theeastafrican.co.ke||ke Daily News (Tanzania)|online|dailynews.co.tz||tz Daily News (Zimbabwe)|online|dailynews.co.zw||tz +SAVCA|online|savca.co.za||za +How We Made It In Africa|online|howwemadeitinafrica.com||za +Rhodes University (MathewYaungwaBlog)|online|mathewnyaungwa.blogspot.co.za||za +World Stage|online|worldstagegroup.com||ng +Classic FM|online|classic97.net||ng +Agence France Presse|online|afp.com||fr +Naija News Agency|online|naijanewsagency.com||ng +Daily Trust Newspaper|online|dailytrust.com.ng||ng +Daily Telegraph New Telegraph Online|online|newtelegraphonline.com||ng +The Point|online|thepointng.com||ng +The Daily Times|online|dailytimes.ng||ng +The Nation Online|online|thenationonlineng.net||ng +Media Max Network|online|mediamaxnetwork.co.ke||ke +Leadership|online|leadership.ng||ng +The Interview|online|theinterview.com.ng||ng +RSA Parliament|online|parliament.gov.za||za +Guardian|online|guardian.ng||ng 
+Naitional Daily Nigeria|online|nationaldailyng.com||ng +Nigerian Television Authority|online|nta.ng||ng +ACDIVOCA|online|acdivoca.org||us +This Day Live|online|thisdaylive.com||ng +Channel Africa|online|channelafrica.co.za||za +News Agency Of Nigeria|online|nan.ng||ng +Nigeria Today|online|nigeriatoday.ng||ng +Business Day Online|online|businessdayonline.com||ng +Standard Media KTN News|online|standardmedia.co.ke/ktnnews||ke +Global Times China|online|globaltimes.cn||cn +National Mirror|online|nationalmirroronline.net||ng +Monitor Kenya|online|monitor.co.ke||ke +Newsverge|online|newsverge.com||ng +Sundiata Post|online|sundiatapost.com||ng +Agrilinks|online|agrilinks.org||us +Business Daily Africa|online|businessdailyafrica.com||ke +The Business Post|online|thebusinesspost.ng||ng +The Guardian UK|online|theguardian.com||gb +Independent NG|online|independent.ng||ng +The Nerve Africa|online|thenerveafrica.com||ng +Ameh News|online|amehnews.com||ng +Sun News Online|online|sunnewsonline.com||ng +Seed Magazine|online|seedmagazine.co.ke||ke +Business Hallmark News|online|hallmarknews.com||ng +Destiny Connect|online|destinyconnect.com||za +The Economist|online|economist.com||us +Washington Post|online|washingtonpost.com||us +Ama Bhungane|online|amabhungane.co.za||za +Africa Investor|online|africainvestor.com||za +Outrepreneurs|online|outrepreneurs.com||ng +CNBC Africa|online|cnbcafrica.com||za +Plan International|online|plan-international.org||gb +Bloomberg|online|bloomberg.com||za """ mediums = [] diff --git a/dexter/models/seeds.py b/dexter/models/seeds.py index 03157157..91c62851 100644 --- a/dexter/models/seeds.py +++ b/dexter/models/seeds.py @@ -4,80 +4,80 @@ def seed_db(db): """ Add seed entities to the database. 
""" with app.app_context(): - # for x in AnalysisNature.create_defaults(): - # db.session.add(x) - # - # for x in Country.create_defaults(): - # db.session.add(x) - # db.session.flush() - # - # for x in User.create_defaults(): - # db.session.add(x) - # - # for x in Medium.create_defaults(): - # db.session.add(x) - # - # for x in Gender.create_defaults(): - # db.session.add(x) - # - # for x in Race.create_defaults(): - # db.session.add(x) - # - # for x in SourceFunction.create_defaults(): - # db.session.add(x) - # - # for x in Topic.create_defaults(): - # db.session.add(x) - # - # for x in DocumentType.create_defaults(): - # db.session.add(x) - # - # for x in AuthorType.create_defaults(): - # db.session.add(x) - # - # for x in Issue.create_defaults(): - # db.session.add(x) - # - # for x in Fairness.create_defaults(): - # db.session.add(x) - # - # for x in Affiliation.create_defaults(): - # db.session.add(x) - # - # for x in SourceRole.create_defaults(): - # db.session.add(x) - # - # for x in InvestmentType.create_defaults(): - # db.session.add(x) - # - # for x in InvestmentOrigins.create_defaults(): - # db.session.add(x) - # - # for x in Sectors.create_defaults(): - # db.session.add(x) - # - # for x in Phases.create_defaults(): - # db.session.add(x) - # - # for x in Currencies.create_defaults(): - # db.session.add(x) - # - # for x in Industries.create_defaults(): - # db.session.add(x) - # + for x in AnalysisNature.create_defaults(): + db.session.add(x) + + for x in Country.create_defaults(): + db.session.add(x) + db.session.flush() + + for x in User.create_defaults(): + db.session.add(x) + + for x in Medium.create_defaults(): + db.session.add(x) + + for x in Gender.create_defaults(): + db.session.add(x) + + for x in Race.create_defaults(): + db.session.add(x) + + for x in SourceFunction.create_defaults(): + db.session.add(x) + + for x in Topic.create_defaults(): + db.session.add(x) + + for x in DocumentType.create_defaults(): + db.session.add(x) + + for x in 
AuthorType.create_defaults(): + db.session.add(x) + + for x in Issue.create_defaults(): + db.session.add(x) + + for x in Fairness.create_defaults(): + db.session.add(x) + + for x in Affiliation.create_defaults(): + db.session.add(x) + + for x in SourceRole.create_defaults(): + db.session.add(x) + + for x in InvestmentType.create_defaults(): + db.session.add(x) + + for x in InvestmentOrigins.create_defaults(): + db.session.add(x) + + for x in Sectors.create_defaults(): + db.session.add(x) + + for x in Phases.create_defaults(): + db.session.add(x) + + for x in Currencies.create_defaults(): + db.session.add(x) + + for x in Industries.create_defaults(): + db.session.add(x) + # for x in Involvements.create_defaults(): # db.session.add(x) - # - # for x in ValueUnits.create_defaults(): - # db.session.add(x) - # - # db.session.flush() - # - # for x in Principle.create_defaults(): - # db.session.add(x) - # - # for x in Role.create_defaults(): - # db.session.add(x) + + for x in ValueUnits.create_defaults(): + db.session.add(x) + + db.session.flush() + + for x in Principle.create_defaults(): + db.session.add(x) + + for x in Role.create_defaults(): + db.session.add(x) for x in Provinces.create_defaults(): db.session.add(x) diff --git a/dexter/processing/crawlers/__init__.py b/dexter/processing/crawlers/__init__.py index 878dd9f3..a1e92fed 100644 --- a/dexter/processing/crawlers/__init__.py +++ b/dexter/processing/crawlers/__init__.py @@ -18,4 +18,54 @@ from .newsdayzw import NewsDayZWCrawler from .dwcom import DWCrawler from .chroniclezw import ChronicleZWCrawler -from .bbc import BBCCrawler \ No newline at end of file +from .bbc import BBCCrawler +from .howwemadeitinafrica import HowWeMadeItInAfricaCrawler +from .savca import SAVCACrawler +from .rhodesunimathewblog import RhodesUniMathewBlogCrawler +from .worldstage import WorldStageCrawler +from .classicfm import ClassicFMCrawler +from .afp import AFPCrawler +from .naijanews import NaijaNewsCrawler +from .dailytrustnp import 
DailyTrustNPCrawler +from .newteleonline import NewTeleOnlineCrawler +from .thepoint import ThePointCrawler +from .dailytimes import DailyTimesCrawler +from .thenation import TheNationCrawler +from .mediamaxnet import MediaMaxNetCrawler +from .leadership import LeadershipCrawler +from .theinterview import TheInterviewCrawler +from .rsaparliament import RSAParliamentCrawler +from .guardian import GuardianCrawler +from .nationaldailyng import NationalDailyNgCrawler +from .nta import NTACrawler +from .acdivoca import ACDIVOCACrawler +from .thisdaylive import ThisDayLiveCrawler +from .channelafrica import ChannelAfricaCrawler +from .nan import NANCrawler +from .nigeriatoday import NigeriaTodayCrawler +from .businessdayonline import BusinessDayOnlineCrawler +from .standardmediaktnnews import StandardMediaKTNCrawler +from .globaltimescn import GlobalTimesCN +from .nationalmirror import NationalMirrorCrawler +from .monitorke import MonitorKECrawler +from .newsverge import NewsvergeCrawler +from .sundiatapost import SundiataPostCrawler +from .agrilinks import AgrilinksCrawler +from .businessdailyafrica import BusinessDailyAfricaCrawler +from .thebusinesspost import TheBusinessPostCrawler +from .theguardianuk import TheGuardianUKCrawler +from .independentng import IndependentNGCrawler +from .thenerveafrica import TheNerveAfricaCrawler +from .amehnews import AmehNewsCrawler +from .sunnewsonline import SunNewsOnlineCrawler +from .seedmagazine import SeedMagazineCrawler +from .hallmarknews import HallmarkNewsCrawler +from .destinyconnect import DestinyConnectCrawler +from .economist import EconomistCrawler +from .washingtonpost import WashingtonPostCrawler +from .amabhungane import AmaBhunganeCrawler +from .africainvestor import AfricaInvestorCrawler +from .outrepreneurs import OutrepreneursCrawler +from .cnbcafrica import CNBCAfricaCrawler +from .planintl import PlanIntlCrawler +from .bloomberg import BloombergCrawler \ No newline at end of file diff --git 
a/dexter/processing/crawlers/acdivoca.py b/dexter/processing/crawlers/acdivoca.py new file mode 100644 index 00000000..03cc080c --- /dev/null +++ b/dexter/processing/crawlers/acdivoca.py @@ -0,0 +1,57 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class ACDIVOCACrawler(BaseCrawler): + ACDIVO_RE = re.compile('(www\.)?acdivoca.org') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.ACDIVO_RE.match(parts.netloc)) + + def fetch(self, url): + """ + Fetch and return the raw HTML for this url. + The return content is a unicode string. + """ + self.log.info("Fetching URL: " + url) + + headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'} + + r = requests.get(url, headers=headers, timeout=10) + # raise an HTTPError on badness + r.raise_for_status() + + # this decodes r.content using a guessed encoding + return r.text + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(ACDIVOCACrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#main #page-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#main .meta-top .date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#main .main-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('#main .meta-author h3 a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/afp.py b/dexter/processing/crawlers/afp.py new file mode 100644 index 00000000..1d4afe10 --- /dev/null +++ b/dexter/processing/crawlers/afp.py @@ -0,0 +1,52 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class AFPCrawler(BaseCrawler): + AFP_RE = re.compile('(www\.)?afp.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.AFP_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(AFPCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.container .article_content h3.htitle')) + + #gather publish date + #date_nodes = soup.select('.container .article_content .article_content_meta .article_content_date span') + date = self.extract_plaintext(soup.select('.container .article_content .article_content_meta .article_content_date span')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.container .article_content .textcontent p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/africainvestor.py b/dexter/processing/crawlers/africainvestor.py new file mode 100644 index 00000000..278c1404 --- /dev/null +++ b/dexter/processing/crawlers/africainvestor.py @@ -0,0 +1,62 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class AfricaInvestorCrawler(BaseCrawler): + AI_RE = re.compile('(www\.)?africainvestor.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.AI_RE.match(parts.netloc)) + + + def fetch(self, url): + """ + Fetch and return the raw HTML for this url. + The return content is a unicode string. 
+ """ + self.log.info("Fetching URL: " + url) + + headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'} + + r = requests.get(url, headers=headers, timeout=10) + # raise an HTTPError on badness + r.raise_for_status() + + # this decodes r.content using a guessed encoding + return r.text + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(AfricaInvestorCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.post .td-post-title .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.post .td-post-title .td-module-meta-info .td-post-date time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.post .td-post-content') + text_list = [] + for node in nodes[0].children: + if node.name in ['h5','p']: + text_list = text_list + [node] + doc.summary = "\n\n".join(p.text.strip() for p in text_list[:1]) + doc.text = "\n\n".join(p.text.strip() for p in text_list) + + # gather author + author = self.extract_plaintext(soup.select('article.post .td-post-title .td-module-meta-info .td-post-author-name a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/agrilinks.py b/dexter/processing/crawlers/agrilinks.py new file mode 100644 index 00000000..fb8540f3 --- /dev/null +++ b/dexter/processing/crawlers/agrilinks.py @@ -0,0 +1,62 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class AgrilinksCrawler(BaseCrawler): + A_RE = re.compile('agrilinks.org') + + def offer(self, url): + """ 
Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.A_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(AgrilinksCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#content-main .node-post .field-name-title .page-header')) + + #gather publish date + date = self.extract_plaintext(soup.select('#content-main .node-post .group-post-info .field-name-post-date .field-item')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#content-main .node-post .field-name-body .field-item') + child_list = [] + for child in nodes[0].descendants: + if isinstance(child, basestring): + child_list.append(child) + doc.summary = " ".join(p.strip() for p in child_list[:3]) + doc.text = " ".join(p for p in child_list) + + # gather author + author = '' + author_nodes = soup.select('#content-main .node-post .group-post-info .group-author-name a') + if author_nodes: + author = ''.join(a.text.strip() for a in author_nodes) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/amabhungane.py b/dexter/processing/crawlers/amabhungane.py new file mode 100644 index 00000000..bb043673 --- /dev/null +++ 
b/dexter/processing/crawlers/amabhungane.py @@ -0,0 +1,46 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class AmaBhunganeCrawler(BaseCrawler): + AB_RE = re.compile('amabhungane.co.za') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.AB_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + raw_html = raw_html.encode("utf-8") + raw_html = unicode(raw_html, errors='ignore') + + super(AmaBhunganeCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.large-12 .orbit-caption h5')) + + #gather text and summary + article_nodes = soup.select('.large-8 > .row') + body_nodes = article_nodes[1].select('.large-12 > p') + doc.summary = "\n\n".join(p.text.strip() for p in body_nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in body_nodes) + + date_author = self.extract_plaintext(soup.select('.large-12 .orbit-caption time')) + #gather publish date + date = date_author[:date_author.index('-') - 1].strip() + doc.published_at = self.parse_timestamp(date) + + # gather author + author = date_author[date_author.index('-') + 1:] + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/amehnews.py b/dexter/processing/crawlers/amehnews.py new file mode 100644 index 00000000..8111e7c7 --- /dev/null +++ b/dexter/processing/crawlers/amehnews.py @@ -0,0 +1,41 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class AmehNewsCrawler(BaseCrawler): + AN_RE = 
re.compile('amehnews.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.AN_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(AmehNewsCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.post h1.entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.post .entry-meta .entry-date time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.post .entry-content > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('article.post .entry-meta .entry-author a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/bloomberg.py b/dexter/processing/crawlers/bloomberg.py new file mode 100644 index 00000000..f70df06d --- /dev/null +++ b/dexter/processing/crawlers/bloomberg.py @@ -0,0 +1,70 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class BloombergCrawler(BaseCrawler): + B_RE = re.compile('(www\.)?bloomberg.com') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.B_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(BloombergCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article .lede-text-only .lede-text-only__content .lede-text-only__hed .lede-text-only__highlight')) + + #gather publish date + date_node = soup.select('article .lede-text-only .lede-text-only__content time.article-timestamp') + date = '' + for node in date_node[0].children: + if node.name == 'noscript': + date = node.text.strip() + doc.published_at = self.parse_timestamp(date) + + + + #gather text and summary + summary_nodes = soup.select('article .content-well .abstract li') + doc.summary = "\n\n".join(p.text.strip() for p in summary_nodes) + + nodes = soup.select('article .content-well .body-copy') + text_list = [] + for node in nodes[0].children: + if node.name in ['h3','p']: + text_list = text_list + [node] + doc.text = "\n\n".join(p.text.strip() for p in text_list) + + # gather author + author = [] + author_nodes = soup.select('article .lede-text-only .lede-text-only__content .author') + for node in author_nodes: + author += [node.find(text=True).strip()] + if author: + doc.author = Author.get_or_create(','.join(author), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git 
a/dexter/processing/crawlers/businessdailyafrica.py b/dexter/processing/crawlers/businessdailyafrica.py new file mode 100644 index 00000000..5ebb3db5 --- /dev/null +++ b/dexter/processing/crawlers/businessdailyafrica.py @@ -0,0 +1,60 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class BusinessDailyAfricaCrawler(BaseCrawler): + BDA_RE = re.compile('(www\.)?businessdailyafrica.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.BDA_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(BusinessDailyAfricaCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.article-story .page-box-inner header .article-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.article-story .page-box-inner header .byline')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.article-story .page-box-inner p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = '' + byline = self.extract_plaintext(soup.select('article.article-story .page-box-inner header .mobileShow .byline')) + if 'BY ' in byline: + author = byline[byline.index('BY ') + 3:] + else: + author = byline + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/businessdayonline.py b/dexter/processing/crawlers/businessdayonline.py new file mode 100644 index 00000000..759083ae --- /dev/null +++ b/dexter/processing/crawlers/businessdayonline.py @@ -0,0 +1,41 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class BusinessDayOnlineCrawler(BaseCrawler): + BDO_RE = re.compile('(www\.)?businessdayonline.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.BDO_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(BusinessDayOnlineCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.container article h1 a')) + + #gather publish date + date = self.extract_plaintext(soup.select('.container article .date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.container article p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.container article .author a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/channelafrica.py b/dexter/processing/crawlers/channelafrica.py new file mode 100644 index 00000000..96d16bf9 --- /dev/null +++ b/dexter/processing/crawlers/channelafrica.py @@ -0,0 +1,38 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class ChannelAfricaCrawler(BaseCrawler): + CA_RE = re.compile('(www\.)?channelafrica.co.za') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.CA_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(ChannelAfricaCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.MainContentInner .inPageHeader')) + + #gather publish date + date = self.extract_plaintext(soup.select('.MainContentInner .datesContainer .dates')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + summary = self.extract_plaintext(soup.select('.MainContentInner .ArticleInner .excerpt p')) + doc.summary = summary + nodes = soup.select('.MainContentInner .ArticleInner .articleBody p') + doc.text = summary + '\n\n' + "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/classicfm.py b/dexter/processing/crawlers/classicfm.py new file mode 100644 index 00000000..6776329e --- /dev/null +++ b/dexter/processing/crawlers/classicfm.py @@ -0,0 +1,37 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class ClassicFMCrawler(BaseCrawler): + CFM_RE = re.compile('www.classic97.net') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.CFM_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(ClassicFMCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#maincontent #page-title')) + + #gather publish date + date = doc.url[-10:] + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#maincontent .field-item p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/cnbcafrica.py b/dexter/processing/crawlers/cnbcafrica.py new file mode 100644 index 00000000..40a4d90c --- /dev/null +++ b/dexter/processing/crawlers/cnbcafrica.py @@ -0,0 +1,60 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class CNBCAfricaCrawler(BaseCrawler): + CBNCA_RE = re.compile('(www\.)?cnbcafrica.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.CBNCA_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(CNBCAfricaCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.post .td-post-header .td-post-title .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.post .td-post-header .td-module-meta-info .td-post-date time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.post .td-post-content') + text_list = [] + for node in nodes[0].children: + if node.name in ['h3','h4','h5','p']: + text_list = text_list + [node] + doc.summary = "\n\n".join(p.text.strip() for p in text_list[:3]) + doc.text = "\n\n".join(p.text.strip() for p in text_list) + + + # gather author + author = self.extract_plaintext(soup.select('article.post .td-post-header .td-module-meta-info .td-post-author-name a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/dailytimes.py b/dexter/processing/crawlers/dailytimes.py new file mode 100644 index 00000000..72c8e1a7 --- /dev/null +++ b/dexter/processing/crawlers/dailytimes.py @@ -0,0 +1,55 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class DailyTimesCrawler(BaseCrawler): + DT_RE = re.compile('dailytimes.ng') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.DT_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(DailyTimesCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.post .post-header h1.post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.post .post-header .post-byline')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.post .post-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.post .post-author h3')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/dailytrustnp.py b/dexter/processing/crawlers/dailytrustnp.py new file mode 100644 index 00000000..5cfbe856 --- /dev/null +++ b/dexter/processing/crawlers/dailytrustnp.py @@ -0,0 +1,62 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class DailyTrustNPCrawler(BaseCrawler): + DTNP_RE = re.compile('(www\.)?dailytrust.com.ng') + + 
def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.DTNP_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(DailyTrustNPCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.container .story h1')) + + author_date_str = self.extract_plaintext(soup.select('.container .story span.storydate')) + + #gather publish date + date = author_date_str[author_date_str.index('Publish Date:') + 13:].strip() + print "This date %s" % (date) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.container .fullstory p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + if '-' not in author_date_str: + author = author_date_str[author_date_str.index('By') + 2 : author_date_str.index('|')].strip() + + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/destinyconnect.py b/dexter/processing/crawlers/destinyconnect.py new file mode 100644 index 00000000..12e853bb --- /dev/null +++ b/dexter/processing/crawlers/destinyconnect.py @@ -0,0 +1,41 
@@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class DestinyConnectCrawler(BaseCrawler): + DC_RE = re.compile('(www\.)?destinyconnect.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.DC_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(DestinyConnectCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.post .entry-header .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.post .entry-header .entry-meta time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.post .entry-content > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('article.post .entry-header .entry-meta .byline .author')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/economist.py b/dexter/processing/crawlers/economist.py new file mode 100644 index 00000000..944c3a1a --- /dev/null +++ b/dexter/processing/crawlers/economist.py @@ -0,0 +1,61 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class EconomistCrawler(BaseCrawler): + E_RE = re.compile('(www\.)?economist.com') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.E_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(EconomistCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.blog-post .flytitle-and-title__body .flytitle-and-title__title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.blog-post .blog-post__section-date-author time.blog-post__datetime')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.blog-post .blog-post__text > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author_byline = self.extract_plaintext(soup.select('article.blog-post .blog-post__section-date-author .blog-post__byline-container .blog-post__byline')) + author = '' + if '|' in author_byline: + author = author_byline[:author_byline.index('|') -1] + else: + author = author_byline + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() + diff --git a/dexter/processing/crawlers/globaltimescn.py b/dexter/processing/crawlers/globaltimescn.py new file mode 100644 index 00000000..0c8fca6a --- /dev/null +++ b/dexter/processing/crawlers/globaltimescn.py @@ -0,0 +1,50 @@ 
+from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class GlobalTimesCN(BaseCrawler): + GTCN_RE = re.compile('(www\.)?globaltimes.cn') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.GTCN_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(GlobalTimesCN, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#contents #left .article-title h3')) + + #gather publish date + date = '' + source_string = self.extract_plaintext(soup.select('#contents #left .article-source .text-left')) + if 'Published:' in source_string: + date = source_string[source_string.index('Published:') + 10:].strip() + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#contents #left .row-content') + child_list = [] + for child in nodes[0].descendants: + if isinstance(child, basestring): + child_list.append(child) + doc.summary = "\n".join(p for p in child_list[:1]) + doc.text = "\n".join(p for p in child_list) + + # gather author + author = '' + if 'By ' in source_string: + author = source_string[source_string.index('By ') + 3:source_string.index('Source:')-1].strip() + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/guardian.py b/dexter/processing/crawlers/guardian.py new file mode 100644 index 00000000..a45cd1fc --- /dev/null +++ b/dexter/processing/crawlers/guardian.py @@ -0,0 +1,66 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from 
...models import Entity, Author, AuthorType + +class GuardianCrawler(BaseCrawler): + G_RE = re.compile('guardian.ng') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.G_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(GuardianCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + # gather title + doc.title = self.extract_plaintext(soup.select('.single-post-header h1.single-article-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.page-main .single-article-aside .single-article-datetime')) + doc.published_at = self.parse_timestamp(date[:date.index('|') - 1].strip() + ' ' + date[date.index('|') + 1:].strip()) + + #gather text and summary + summary_list = [] + summary_nodes = soup.select('.page-main .single-article-content article') + for item in summary_nodes[0].find('br').next_siblings: + if item != u'\n': + if isinstance(item, basestring): + summary_list.append(item) + else: + summary_list.append(item.text) + summary = summary_list[0].strip() + doc.summary = summary + text_nodes = soup.select('.page-main .single-article-content article p') + doc.text = summary + "\n\n" + "\n\n".join(p.text.strip() for p in text_nodes[2:]) + + # gather author + author = self.extract_plaintext(soup.select('.page-main .single-article-aside .single-article-author strong')) + if 
author: + if ',' in author: + doc.author = Author.get_or_create(author[:author.index(',')].strip(), AuthorType.journalist()) + else: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/hallmarknews.py b/dexter/processing/crawlers/hallmarknews.py new file mode 100644 index 00000000..3a3c4799 --- /dev/null +++ b/dexter/processing/crawlers/hallmarknews.py @@ -0,0 +1,43 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class HallmarkNewsCrawler(BaseCrawler): + HN_RE = re.compile('hallmarknews.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.HN_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(HallmarkNewsCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#content .post h1.posttitle')) + + #gather publish date + meta_date = self.extract_plaintext(soup.select('#content #datemeta #datemeta_l')) + date = meta_date[meta_date.index('Published On:') + 13:].strip() + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#content .post .entry > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author_nodes = soup.select('#content #datemeta #datemeta_r a') + author = author_nodes[-1].text + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/howwemadeitinafrica.py b/dexter/processing/crawlers/howwemadeitinafrica.py new file mode 
100644 index 00000000..61903fdc --- /dev/null +++ b/dexter/processing/crawlers/howwemadeitinafrica.py @@ -0,0 +1,71 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class HowWeMadeItInAfricaCrawler(BaseCrawler): + HWMIIA_RE = re.compile('(www\.)?howwemadeitinafrica.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.HWMIIA_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def fetch(self, url): + """ + Fetch and return the raw HTML for this url. + The return content is a unicode string. + """ + self.log.info("Fetching URL: " + url) + + headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'} + + r = requests.get(url, headers=headers, timeout=10) + # raise an HTTPError on badness + r.raise_for_status() + + # this decodes r.content using a guessed encoding + return r.text + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(HowWeMadeItInAfricaCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.content-wrapper .post header h1')) + + #gather publish date + date = self.extract_plaintext(soup.select('.content-wrapper .post header p time')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select(".content-wrapper .post .content p") + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select(".content-wrapper .post p a")) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/independentng.py b/dexter/processing/crawlers/independentng.py new file mode 100644 index 00000000..c280edb3 --- /dev/null +++ b/dexter/processing/crawlers/independentng.py @@ -0,0 +1,55 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class IndependentNGCrawler(BaseCrawler): + ING_RE = re.compile('independent.ng') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.ING_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. 
+ return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(IndependentNGCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('article.post .td-post-header .td-post-title .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('article.post .td-post-header .td-post-title .td-module-meta-info .td-post-date time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('article.post .td-post-content > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes[:-1]) + + # gather author + author = self.extract_plaintext(soup.select('article.post .td-post-header .td-post-title .td-module-meta-info .td-post-author-name a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/leadership.py b/dexter/processing/crawlers/leadership.py new file mode 100644 index 00000000..fb5a2eaf --- /dev/null +++ b/dexter/processing/crawlers/leadership.py @@ -0,0 +1,59 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class LeadershipCrawler(BaseCrawler): + L_RE = re.compile('leadership.ng') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.L_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(LeadershipCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#mvp-post-main #mvp-post-content h1.mvp-post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#mvp-post-main #mvp-post-content .mvp-author-info-date time.post-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#mvp-post-main #mvp-post-content #mvp-content-main > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author_nodes = soup.select('#mvp-post-main #mvp-post-content .mvp-author-info-name .author-name a') + if len(author_nodes) > 1: + author = self.extract_plaintext([author_nodes[1]]) + else: + author = self.extract_plaintext([author_nodes[0]]) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/mediamaxnet.py b/dexter/processing/crawlers/mediamaxnet.py new file mode 100644 index 00000000..f712cb5d --- /dev/null +++ b/dexter/processing/crawlers/mediamaxnet.py @@ -0,0 +1,41 @@ +from urlparse import urlparse, urlunparse +import re + 
+from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class MediaMaxNetCrawler(BaseCrawler): + MMN_RE = re.compile('(www\.)?mediamaxnetwork.co.ke') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.MMN_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(MediaMaxNetCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.post header h1.entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.post header time.entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.post .entry-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.post header .author-link a span')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/monitorke.py b/dexter/processing/crawlers/monitorke.py new file mode 100644 index 00000000..d0102060 --- /dev/null +++ b/dexter/processing/crawlers/monitorke.py @@ -0,0 +1,46 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class MonitorKECrawler(BaseCrawler): + MKE_RE = re.compile('(www\.)?monitor.co.ke') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.MKE_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(MonitorKECrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#main-content .post .post-inner .post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#main-content .post .post-inner .updated')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#main-content .post .post-inner .entry p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = '' + entry_author = nodes[0].text + if 'By ' in entry_author: + author = entry_author[entry_author.index('By ') + 3:].strip() + else: + author = self.extract_plaintext(soup.select('#main-content .post .post-inner .post-meta .post-meta-author a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/naijanews.py b/dexter/processing/crawlers/naijanews.py new file mode 100644 index 00000000..a00c2cc4 --- /dev/null +++ b/dexter/processing/crawlers/naijanews.py @@ -0,0 +1,56 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NaijaNewsCrawler(BaseCrawler): + NNA_RE = re.compile('naijanewsagency.com') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.NNA_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NaijaNewsCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.main_container h1.post-tile')) + + #gather publish date + #date_nodes = soup.select('.container .article_content .article_content_meta .article_content_date span') + date = self.extract_plaintext(soup.select('.main_container .single-post-meta span time')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.main_container .entry-content') + text_list = [] + for node in nodes[0].children: + if node.name in ['h2','h3','p']: + text_list = text_list + [node] + doc.summary = "\n\n".join(p.text.strip() for p in text_list[:2]) + doc.text = "\n\n".join(p.text.strip() for p in text_list) + + # gather author + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/nan.py b/dexter/processing/crawlers/nan.py new file mode 100644 index 00000000..83a6c1b5 --- /dev/null +++ b/dexter/processing/crawlers/nan.py @@ -0,0 +1,37 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NANCrawler(BaseCrawler): + NAN_RE = 
re.compile('(www\.)?nan.ng') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.NAN_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NANCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.post .article-header .xt-post-title')) + + #gather publish date + date = soup.select('.post article time')[0]['datetime'] + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.post .article-content .post-body p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/nationaldailyng.py b/dexter/processing/crawlers/nationaldailyng.py new file mode 100644 index 00000000..5766ad9f --- /dev/null +++ b/dexter/processing/crawlers/nationaldailyng.py @@ -0,0 +1,66 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NationalDailyNgCrawler(BaseCrawler): + NDN_RE = re.compile('(www\.)?nationaldailyng.com') + ignore_lst = [ + 'wabtn_container', + 'fb-root', + 'fbcb_container', + 'td-a-rec td-a-rec-id-content_bottom ', + 'td-a-rec td-a-rec-id-content_inline '] + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.NDN_RE.match(parts.netloc)) + + def validate_attrs(self, attrs): + """ Validation test to check if an element is on the ignore list. 
""" + for item in self.ignore_lst: + if item in attrs.values(): + return False + if 'class' in attrs: + for c in attrs['class']: + if c == item: + return False + + return True + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NationalDailyNgCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.td-main-content .td-post-header .td-post-title h1.entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.td-main-content .td-post-header .td-post-title .td-post-date time')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.td-main-content .td-post-content') + doc.summary = '' + doc.text = '' + for node in nodes[0].contents: + if not isinstance(node, basestring) and self.validate_attrs(node.attrs): + doc.text += "\n\n" + node.text.strip() if node.text.strip() else '' + if len(doc.summary) < 200: + doc.summary += "\n\n" + node.text.strip() if node.text.strip() else '' + doc.text = doc.text.strip() + doc.summary = doc.summary.strip() + + # gather author + author = self.extract_plaintext(soup.select('.td-main-content .td-post-header .td-post-title .td-post-author-name a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/nationalmirror.py b/dexter/processing/crawlers/nationalmirror.py new file mode 100644 index 00000000..4d1dd269 --- /dev/null +++ b/dexter/processing/crawlers/nationalmirror.py @@ -0,0 +1,51 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NationalMirrorCrawler(BaseCrawler): + NM_RE = re.compile('(www\.)?nationalmirroronline.net') + + def offer(self, url): + 
""" Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.NM_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NationalMirrorCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#primary .post .entry-header .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#primary .post .entry-header .entry-meta .entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#primary .post .entry-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/newsverge.py b/dexter/processing/crawlers/newsverge.py new file mode 100644 index 00000000..0deb6f8d --- /dev/null +++ b/dexter/processing/crawlers/newsverge.py @@ -0,0 +1,56 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NewsvergeCrawler(BaseCrawler): + N_RE = re.compile('newsverge.com') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.N_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NewsvergeCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.td-ss-main-content .post .td-post-header .td-post-title .entry-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.td-ss-main-content .post .td-post-header .td-post-title .td-post-date .entry-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.td-ss-main-content .post .td-post-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author_nodes = soup.select('.td-ss-main-content .post .td-post-header .td-post-title .td-post-author-name a') + author = "\n\n".join(p.text.strip() for p in author_nodes) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/newteleonline.py b/dexter/processing/crawlers/newteleonline.py new file mode 100644 index 00000000..fbd515d2 --- /dev/null +++ b/dexter/processing/crawlers/newteleonline.py @@ -0,0 +1,41 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import 
requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NewTeleOnlineCrawler(BaseCrawler): + NTO_RE = re.compile('newtelegraphonline.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.NTO_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NewTeleOnlineCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#mvp-main-body-wrap h1.mvp-post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#mvp-main-body-wrap .mvp-author-info-date span.mvp-post-date time.post-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#mvp-main-body-wrap #mvp-content-main p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('#mvp-main-body-wrap .mvp-author-info-wrap .author-name a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/nigeriatoday.py b/dexter/processing/crawlers/nigeriatoday.py new file mode 100644 index 00000000..44bf21be --- /dev/null +++ b/dexter/processing/crawlers/nigeriatoday.py @@ -0,0 +1,38 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NigeriaTodayCrawler(BaseCrawler): + NT_RE = re.compile('(www\.)?nigeriatoday.ng') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.NT_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(NigeriaTodayCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.post h1.title')) + + #gather publish date + meta_info = self.extract_plaintext(soup.select('.post .post-meta p')) + date = meta_info[meta_info.index(' on') + 3:meta_info.index(' in')].strip() + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.post > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/nta.py b/dexter/processing/crawlers/nta.py new file mode 100644 index 00000000..57a00314 --- /dev/null +++ b/dexter/processing/crawlers/nta.py @@ -0,0 +1,42 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class NTACrawler(BaseCrawler): + NTA_RE = re.compile('(www\.)?nta.ng') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.NTA_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(NTACrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.post-header h1.post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.post-header .post-meta a.date span')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.post-body p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.post-body .post-author .author-info h4 a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() + diff --git a/dexter/processing/crawlers/outrepreneurs.py b/dexter/processing/crawlers/outrepreneurs.py new file mode 100644 index 00000000..afe584fe --- /dev/null +++ b/dexter/processing/crawlers/outrepreneurs.py @@ -0,0 +1,58 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class OutrepreneursCrawler(BaseCrawler): + O_RE = re.compile('outrepreneurs.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.O_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. 
+ return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(OutrepreneursCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.block-content .single-post-box .title-post h1')) + + byline = soup.select('.block-content .single-post-box .title-post .post-tags li') + + #gather publish date + date = byline[0].text.strip() + doc.published_at = self.parse_timestamp(date) + + # gather author + author = self.extract_plaintext(byline[1].select('a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() + + #gather text and summary + nodes = soup.select('.block-content .single-post-box .the-content > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes[:-1]) + diff --git a/dexter/processing/crawlers/planintl.py b/dexter/processing/crawlers/planintl.py new file mode 100644 index 00000000..ed613aa8 --- /dev/null +++ b/dexter/processing/crawlers/planintl.py @@ -0,0 +1,64 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class PlanIntlCrawler(BaseCrawler): + PI_RE = re.compile('plan-international.org') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + return bool(self.PI_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(PlanIntlCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#main-content .header-breadcrumb .page-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#main-content .content-middle .date-display-single')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#main-content .core-content') + text_list = [] + doc.summary = '' + for node in nodes: + for child in node.descendants: + if child.name in ['h3','p','li']: + text_list = text_list + [child] + if len(doc.summary) < 200: + doc.summary = "\n\n".join(p.text.strip() for p in text_list).strip() + doc.text = "\n\n".join(p.text.strip() for p in text_list).strip() + + # gather author + author_type_A = self.extract_plaintext(soup.select('#main-content .content-middle .article-meta .field-guest-authors .field-guest-author')) + author_type_B = self.extract_plaintext(soup.select('#main-content .content-middle .author-bio h2')) + author = author_type_A + author_type_B + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git 
a/dexter/processing/crawlers/rhodesunimathewblog.py b/dexter/processing/crawlers/rhodesunimathewblog.py new file mode 100644 index 00000000..d1bef85b --- /dev/null +++ b/dexter/processing/crawlers/rhodesunimathewblog.py @@ -0,0 +1,41 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class RhodesUniMathewBlogCrawler(BaseCrawler): + RUMB_RE = re.compile('mathewnyaungwa.blogspot.co.za') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.RUMB_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(RhodesUniMathewBlogCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.blog-posts .date-posts .post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.blog-posts .date-header span')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.blog-posts .date-posts .post-body span') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.blog-posts .post-footer a.g-profile')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/rsaparliament.py b/dexter/processing/crawlers/rsaparliament.py new file mode 100644 index 00000000..616bc526 --- /dev/null +++ b/dexter/processing/crawlers/rsaparliament.py @@ -0,0 +1,62 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from 
...models import Entity, Author, AuthorType + +class RSAParliamentCrawler(BaseCrawler): + RSAP_RE = re.compile('(www\.)?parliament.gov.za') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.RSAP_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(RSAParliamentCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.page .page-header h4')) + + + #gather text and summary + nodes = soup.select('.page #content .page-content p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes[:-1]) + + #gather publish date + date_pattern = re.compile('(?:\s|\w){1}(\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December) \d{4})(?:[\s\w]|$)') + if re.search(date_pattern, nodes[0].text) != None: + date = re.search(date_pattern, nodes[0].text).group(0) + if re.search(date_pattern, nodes[-1].text) != None: + date = re.search(date_pattern, nodes[-1].text).group(0) + doc.published_at = self.parse_timestamp(date.strip()) + + #gather author + author_pattern = re.compile('(?:By )?([\w ]*)\s*(?:\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December) \d{4})(?:\s|$)|(?:Name: )([\w ]+)') + reg_result = 
re.search(author_pattern, nodes[-1].text) + if reg_result != None: + author = reg_result.group(1) if reg_result.group(1) else reg_result.group(2) + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() diff --git a/dexter/processing/crawlers/savca.py b/dexter/processing/crawlers/savca.py new file mode 100644 index 00000000..255da889 --- /dev/null +++ b/dexter/processing/crawlers/savca.py @@ -0,0 +1,55 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class SAVCACrawler(BaseCrawler): + SAVCA_RE = re.compile('(www\.)?savca.co.za') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.SAVCA_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(SAVCACrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.fl-row-content-wrap .fl-module-heading h1.fl-heading')) + + #gather publish date + date = self.extract_plaintext(soup.select('.fl-row-content-wrap .fl-module-fl-post-info .fl-module-content .fl-post-info-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select(".fl-row-content-wrap .fl-col-content .fl-module-fl-post-content p") + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select(".fl-row-content-wrap .fl-module-fl-post-info .fl-module-content .fl-post-info-author a")) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/seedmagazine.py b/dexter/processing/crawlers/seedmagazine.py new file mode 100644 index 00000000..4369d8a0 --- /dev/null +++ b/dexter/processing/crawlers/seedmagazine.py @@ -0,0 +1,49 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class SeedMagazineCrawler(BaseCrawler): + SM_RE = re.compile('(www\.)?seedmagazine.co.ke') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.SM_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(SeedMagazineCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.content-wrapper h1.page-title')) + + #gather publish date + date = '' + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('.content-wrapper .page-content > p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = '' + entry_author = nodes[0].text + if 'By ' in entry_author: + if '\n' in entry_author: + author = entry_author[entry_author.index('By ') + 3:entry_author.index('\n')].strip() + else: + author = entry_author[entry_author.index('By ') + 3:].strip() + else: + author = self.extract_plaintext(soup.select('.content-wrapper .page-meta-wrapper .author a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/standardmediaktnnews.py b/dexter/processing/crawlers/standardmediaktnnews.py new file mode 100644 index 00000000..4ed851a4 --- /dev/null +++ b/dexter/processing/crawlers/standardmediaktnnews.py @@ -0,0 +1,56 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class StandardMediaKTNCrawler(BaseCrawler): + BDO_RE = re.compile('(www\.)?standardmedia.co.ke/ktnnews') + + def offer(self, url): + """ Can this crawler process this URL? 
""" + parts = urlparse(url) + paths = parts.path.split('/') + return bool(self.BDO_RE.match(parts.netloc + '/' + paths[1])) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. """ + super(StandardMediaKTNCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('.container .card .card-block .card-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('.container .card .card-block .card-text')) + doc.published_at = self.parse_timestamp(date[date.index('|') + 1:].strip()) + + #gather text and summary + nodes = soup.select('.container .card .card-block p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = self.extract_plaintext(soup.select('.container .card .card-text a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() \ No newline at end of file diff --git a/dexter/processing/crawlers/sundiatapost.py b/dexter/processing/crawlers/sundiatapost.py new file mode 100644 index 00000000..303d666d --- /dev/null +++ b/dexter/processing/crawlers/sundiatapost.py @@ -0,0 +1,77 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import 
Entity, Author, AuthorType + +from datetime import datetime, timedelta +from dateutil.parser import parse + +class SundiataPostCrawler(BaseCrawler): + SP_RE = re.compile('sundiatapost.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.SP_RE.match(parts.netloc)) + + def canonicalise_url(self, url): + """ Strip anchors, etc.""" + + # Needed to handle urls being recieved without protocol (http[s]://), check if it can be parsed first, then handle and re parse if there is no netloc found + if '//' not in url: + url = '%s%s' % ('https://', url) + + parts = urlparse(url) + + netloc = parts.netloc.strip(':80') + + # force http, strip trailing slash, anchors etc. + return urlunparse(['https', netloc, parts.path.rstrip('/') or '/', parts.params, parts.query, None]) + + def parse_timestamp(self, ts): + if 'hour' in ts: + return datetime.now() - timedelta(hours = int(ts[:ts.index('hour') -1].strip())) + elif 'day' in ts: + return datetime.now() - timedelta(days = int(ts[:ts.index('day') -1].strip())) + elif 'week' in ts: + return datetime.now() - timedelta(weeks = int(ts[:ts.index('week') -1].strip())) + else: + return parse(ts) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
""" + super(SundiataPostCrawler, self).extract(doc, raw_html) + + soup = BeautifulSoup(raw_html) + + # gather title + doc.title = self.extract_plaintext(soup.select('#main-content .post .post-inner .post-title')) + + #gather publish date + date = self.extract_plaintext(soup.select('#main-content .post .post-inner .post-meta .tie-date')) + doc.published_at = self.parse_timestamp(date) + + #gather text and summary + nodes = soup.select('#main-content .post .post-inner .entry p') + doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2]) + doc.text = "\n\n".join(p.text.strip() for p in nodes) + + # gather author + author = '' + entry_author = nodes[0].text + if 'By ' in entry_author: + if '\n' in entry_author: + author = entry_author[entry_author.index('By ') + 3:entry_author.index('\n')].strip() + else: + author = entry_author[entry_author.index('By ') + 3:].strip() + else: + author = self.extract_plaintext(soup.select('#main-content .post .post-inner .post-meta .post-meta-author a')) + if author: + doc.author = Author.get_or_create(author.strip(), AuthorType.journalist()) + else: + doc.author = Author.unknown() + diff --git a/dexter/processing/crawlers/sunnewsonline.py b/dexter/processing/crawlers/sunnewsonline.py new file mode 100644 index 00000000..d3c4b421 --- /dev/null +++ b/dexter/processing/crawlers/sunnewsonline.py @@ -0,0 +1,37 @@ +from urlparse import urlparse, urlunparse +import re + +from bs4 import BeautifulSoup +import requests + +from .base import BaseCrawler +from ...models import Entity, Author, AuthorType + +class SunNewsOnlineCrawler(BaseCrawler): + SNO_RE = re.compile('sunnewsonline.com') + + def offer(self, url): + """ Can this crawler process this URL? """ + parts = urlparse(url) + return bool(self.SNO_RE.match(parts.netloc)) + + def extract(self, doc, raw_html): + """ Extract text and other things from the raw_html for this document. 
class SunNewsOnlineCrawler(BaseCrawler):
    """ Crawler for articles on sunnewsonline.com. """

    # Dots escaped (a bare '.' matches any character) and an optional
    # 'www.' prefix allowed so www-prefixed URLs are also offered.
    SNO_RE = re.compile(r'(www\.)?sunnewsonline\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.SNO_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(SunNewsOnlineCrawler, self).extract(doc, raw_html)
        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.post header h4.entry-title'))

        # gather publish date; the scraped string carries a leading
        # character (icon glyph) -- drop it before parsing.
        date = self.extract_plaintext(soup.select('.post header span.entry-date'))
        doc.published_at = self.parse_timestamp(date[1:].strip())

        # gather text and summary
        nodes = soup.select('.post .elements-box > p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author -- the site exposes no reliable byline markup.
        doc.author = Author.unknown()
class TheBusinessPostCrawler(BaseCrawler):
    """ Crawler for articles on thebusinesspost.ng. """

    # Internal dot escaped so 'thebusinesspostXng' no longer matches.
    TBP_RE = re.compile(r'(www\.)?thebusinesspost\.ng')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TBP_RE.match(parts.netloc))

    def canonicalise_url(self, url):
        """ Strip anchors, force https, etc. so equivalent URLs compare equal. """
        # Handle urls received without a protocol (http[s]://): urlparse would
        # otherwise leave netloc empty and put everything in path.
        if '//' not in url:
            url = 'https://' + url

        parts = urlparse(url)

        # Drop an explicit default port. NOTE: the previous strip(':80')
        # removed ANY leading/trailing ':', '8' or '0' characters, mangling
        # hosts such as 'example80.com'; use an explicit suffix check.
        netloc = parts.netloc
        if netloc.endswith(':80'):
            netloc = netloc[:-3]

        # force https, strip trailing slash, anchors etc.
        return urlunparse(['https', netloc, parts.path.rstrip('/') or '/',
                           parts.params, parts.query, None])

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(TheBusinessPostCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.article-container article.story .media-heading'))

        nodes = soup.select('.article-container article.story p')

        # gather text and summary; the first three paragraphs are
        # boilerplate/byline, the body starts at index 3.
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[3:4])
        doc.text = "\n\n".join(p.text.strip() for p in nodes[3:])

        # gather author and publish date from the byline paragraph,
        # formatted 'Author | Date | ...'. Guard against short pages
        # (previously nodes[2] raised IndexError) and a missing
        # separator (previously .index raised ValueError).
        author = ''
        if len(nodes) > 2:
            byline = nodes[2].text
            author, sep, rest = byline.partition(' | ')
            if sep:
                doc.published_at = self.parse_timestamp(rest.partition(' | ')[0].strip())

        if author.strip():
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class TheGuardianUKCrawler(BaseCrawler):
    """ Crawler for articles on theguardian.com. """

    # Internal dot escaped so 'theguardianXcom' no longer matches.
    TGUK_RE = re.compile(r'(www\.)?theguardian\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TGUK_RE.match(parts.netloc))

    def canonicalise_url(self, url):
        """ Strip anchors, force https, etc. so equivalent URLs compare equal. """
        # Handle urls received without a protocol (http[s]://): urlparse would
        # otherwise leave netloc empty and put everything in path.
        if '//' not in url:
            url = 'https://' + url

        parts = urlparse(url)

        # Drop an explicit default port. NOTE: the previous strip(':80')
        # removed ANY leading/trailing ':', '8' or '0' characters, mangling
        # hosts such as 'example80.com'; use an explicit suffix check.
        netloc = parts.netloc
        if netloc.endswith(':80'):
            netloc = netloc[:-3]

        # force https, strip trailing slash, anchors etc.
        return urlunparse(['https', netloc, parts.path.rstrip('/') or '/',
                           parts.params, parts.query, None])

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        # Round-trip through UTF-8 dropping invalid bytes so BeautifulSoup
        # doesn't choke on badly-encoded input.
        raw_html = raw_html.encode("utf-8")
        raw_html = unicode(raw_html, errors='ignore')

        super(TheGuardianUKCrawler, self).extract(doc, raw_html)
        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('#article .content__article-body .content__head h1.content__headline'))

        # gather publish date; UK-style times use '.' ('6.30pm') which the
        # timestamp parser expects as ':'.
        date = self.extract_plaintext(soup.select('#article .content__article-body .content__head .content__dateline time.content__dateline-wpd'))
        doc.published_at = self.parse_timestamp(date.replace('.', ':'))

        # gather text and summary
        nodes = soup.select('#article .content__article-body > p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('#article .content__article-body .content__head .byline a span'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class TheInterviewCrawler(BaseCrawler):
    """ Crawler for articles on theinterview.com.ng. """

    # Dots escaped so e.g. 'theinterviewXcomYng' no longer matches.
    TI_RE = re.compile(r'(www\.)?theinterview\.com\.ng')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TI_RE.match(parts.netloc))

    def fetch(self, url):
        """
        Fetch and return the raw HTML for this url.

        The return content is a unicode string. A browser user-agent is
        sent because the site rejects the default python-requests one.
        """
        self.log.info("Fetching URL: " + url)

        headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'}

        r = requests.get(url, headers=headers, timeout=10)
        # raise an HTTPError on badness
        r.raise_for_status()

        # this decodes r.content using a guessed encoding
        return r.text

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(TheInterviewCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.post header h2.sd-entry-title'))

        # the meta list holds date and author as text following an <i> icon
        # in each <li> -- assumes item 0 is the date and item 1 the author
        # (TODO confirm against live markup).
        date_author = soup.select('.post header .sd-entry-meta ul li.sd-meta-author')

        # gather publish date
        date = ''.join(date_author[0].find('i').next_siblings)
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('.post .sd-entry-content p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = ''.join(date_author[1].find('i').next_siblings)
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class TheNationCrawler(BaseCrawler):
    """ Crawler for articles on thenationonlineng.net. """

    # Dots escaped and an optional 'www.' prefix allowed so both bare
    # and www hosts are offered.
    TN_RE = re.compile(r'(www\.)?thenationonlineng\.net')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TN_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(TheNationCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.single-post-main .post-single-title h1.entry-title'))

        # gather publish date
        date = self.extract_plaintext(soup.select('.single-post-main .entry-meta .posted-on time.entry-date'))
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('.single-post-main .single-post-content p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('.single-post-main .entry-meta .author a'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class TheNerveAfricaCrawler(BaseCrawler):
    """ Crawler for articles on thenerveafrica.com. """

    # Dots escaped and an optional 'www.' prefix allowed so both bare
    # and www hosts are offered.
    TNA_RE = re.compile(r'(www\.)?thenerveafrica\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TNA_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(TheNerveAfricaCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('article.post .entry-content-container .entry-header h1.entry-title'))

        # gather publish date
        date = self.extract_plaintext(soup.select('article.post .entry-content-container .entry-meta .posted-on time.entry-date'))
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('article.post .entry-content-container .entry-content > p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('article.post .entry-content-container .entry-meta .byline .author a'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class ThePointCrawler(BaseCrawler):
    """ Crawler for articles on thepointng.com. """

    # Internal dot escaped so 'thepointngXcom' no longer matches.
    TP_RE = re.compile(r'(www\.)?thepointng\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TP_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(ThePointCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.td-main-content .td-post-header h1.entry-title'))

        # gather publish date
        date = self.extract_plaintext(soup.select('.td-main-content .td-post-header .td-post-date time.entry-date'))
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('.td-main-content .td-post-content p')
        if len(nodes) >= 2:
            doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2])
        elif nodes:
            # Single-paragraph article: use the text before the first <br>.
            # Guards added: nodes may be empty (previously IndexError) and
            # find('br') may return None (previously AttributeError).
            br = nodes[0].find('br')
            if br is not None:
                # previous_siblings yields nodes in REVERSE document order;
                # re-reverse so the summary reads left-to-right.
                doc.summary = ''.join(unicode(s) for s in reversed(list(br.previous_siblings)))
            else:
                doc.summary = nodes[0].text.strip()
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('.td-main-content .td-post-header .td-post-author-name a'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class ThisDayLiveCrawler(BaseCrawler):
    """ Crawler for articles on thisdaylive.com. """

    # Internal dot escaped so 'thisdayliveXcom' no longer matches.
    TDL_RE = re.compile(r'(www\.)?thisdaylive\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.TDL_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(ThisDayLiveCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('.post .td-post-header .td-post-title .entry-title'))

        # gather publish date
        date = self.extract_plaintext(soup.select('.post .td-post-header .td-post-title time.entry-date'))
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('.post .td-post-content p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:3])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author -- the site exposes no reliable byline markup.
        doc.author = Author.unknown()
class WashingtonPostCrawler(BaseCrawler):
    """ Crawler for articles on washingtonpost.com. """

    # Internal dot escaped so 'washingtonpostXcom' no longer matches.
    WP_RE = re.compile(r'(www\.)?washingtonpost\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.WP_RE.match(parts.netloc))

    def canonicalise_url(self, url):
        """ Strip anchors, force https, etc. so equivalent URLs compare equal. """
        # Handle urls received without a protocol (http[s]://): urlparse would
        # otherwise leave netloc empty and put everything in path.
        if '//' not in url:
            url = 'https://' + url

        parts = urlparse(url)

        # Drop an explicit default port. NOTE: the previous strip(':80')
        # removed ANY leading/trailing ':', '8' or '0' characters, mangling
        # hosts such as 'example80.com'; use an explicit suffix check.
        netloc = parts.netloc
        if netloc.endswith(':80'):
            netloc = netloc[:-3]

        # force https, strip trailing slash, anchors etc.
        return urlunparse(['https', netloc, parts.path.rstrip('/') or '/',
                           parts.params, parts.query, None])

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        super(WashingtonPostCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('#article-topper #topper-headline-wrapper h1'))

        # gather publish date
        date = self.extract_plaintext(soup.select('#main-content #article-body .pb-timestamp'))
        doc.published_at = self.parse_timestamp(date)

        # gather text and summary
        nodes = soup.select('#main-content #article-body article.paywall > p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:2])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('#main-content #article-body .pb-byline a'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
class WorldStageCrawler(BaseCrawler):
    """ Crawler for articles on worldstagegroup.com. """

    # Dots escaped and an optional 'www.' prefix allowed so both bare
    # and www hosts are offered.
    WS_RE = re.compile(r'(www\.)?worldstagegroup\.com')

    def offer(self, url):
        """ Can this crawler process this URL? """
        parts = urlparse(url)
        return bool(self.WS_RE.match(parts.netloc))

    def extract(self, doc, raw_html):
        """ Extract text and other things from the raw_html for this document. """
        # Round-trip through UTF-8 dropping invalid bytes so BeautifulSoup
        # doesn't choke on badly-encoded input.
        raw_html = raw_html.encode("utf-8")
        raw_html = unicode(raw_html, errors='ignore')

        super(WorldStageCrawler, self).extract(doc, raw_html)

        soup = BeautifulSoup(raw_html)

        # gather title
        doc.title = self.extract_plaintext(soup.select('#headline .fnewstitle'))

        # gather publish date; the date field reads 'date | extra', keep the
        # part before the pipe. partition() tolerates a missing '|' (the
        # previous .index('|') raised ValueError in that case).
        date = self.extract_plaintext(soup.select('#headline .fndate'))
        doc.published_at = self.parse_timestamp(date.partition('|')[0].rstrip())

        # gather text and summary
        nodes = soup.select('#headline .fnewssummary p')
        doc.summary = "\n\n".join(p.text.strip() for p in nodes[:1])
        doc.text = "\n\n".join(p.text.strip() for p in nodes)

        # gather author
        author = self.extract_plaintext(soup.select('#headline .catfnauthor'))
        if author:
            doc.author = Author.get_or_create(author.strip(), AuthorType.journalist())
        else:
            doc.author = Author.unknown()
GlobalTimesCN(), + NationalMirrorCrawler(), + MonitorKECrawler(), + NewsvergeCrawler(), + SundiataPostCrawler(), + AgrilinksCrawler(), + BusinessDailyAfricaCrawler(), + TheBusinessPostCrawler(), + TheGuardianUKCrawler(), + IndependentNGCrawler(), + TheNerveAfricaCrawler(), + AmehNewsCrawler(), + SunNewsOnlineCrawler(), + SeedMagazineCrawler(), + HallmarkNewsCrawler(), + DestinyConnectCrawler(), + EconomistCrawler(), + WashingtonPostCrawler(), + AmaBhunganeCrawler(), + AfricaInvestorCrawler(), + OutrepreneursCrawler(), + CNBCAfricaCrawler(), + PlanIntlCrawler(), + BloombergCrawler(), # must come last GenericCrawler()] self.extractors = [ - AlchemyExtractor(), + # AlchemyExtractor(), CalaisExtractor(), SourcesExtractor(), PlacesExtractor()] @@ -170,7 +220,6 @@ def process_feed_item(self, item): if not url: self.log.info("URL could not be parsed, ignoring: %s" % url) return None - existing = Document.query.filter(Document.url == url).first() if existing: self.log.info("URL has already been processed, ignoring: %s" % url) @@ -179,7 +228,6 @@ def process_feed_item(self, item): if not self.newstools_crawler.offer(url): self.log.info("No medium for URL, ignoring: %s" % url) return - # this sets up basic info doc = self.newstools_crawler.crawl(item) try: @@ -190,7 +238,7 @@ def process_feed_item(self, item): raise ProcessingError("Error fetching document: %s" % (e,)) # is it sane? - # TODO: this breaks for isolezwe and other non-english media + # TODO: this breaks for isolezwe and other non-english media' if not doc.text or 'the' not in doc.text: self.log.info("Document %s doesn't have reasonable-looking text, ignoring: %s..." 
% (url, doc.text[0:100])) db.session.rollback() @@ -324,6 +372,7 @@ def __init__(self): PostZambiaCrawler(), TimesZambiaCrawler(), NationKECrawler(), + StandardMediaKTNCrawler(), StandardMediaCrawler(), TheStarKECrawler(), TheEastAfricanKECrawler(), @@ -334,6 +383,55 @@ def __init__(self): DWCrawler(), ChronicleZWCrawler(), BBCCrawler(), + HowWeMadeItInAfricaCrawler(), + SAVCACrawler(), + RhodesUniMathewBlogCrawler(), + WorldStageCrawler(), + ClassicFMCrawler(), + AFPCrawler(), + NaijaNewsCrawler(), + DailyTrustNPCrawler(), + NewTeleOnlineCrawler(), + ThePointCrawler(), + DailyTimesCrawler(), + TheNationCrawler(), + MediaMaxNetCrawler(), + LeadershipCrawler(), + TheInterviewCrawler(), + RSAParliamentCrawler(), + GuardianCrawler(), + NationalDailyNgCrawler(), + NTACrawler(), + ACDIVOCACrawler(), + ThisDayLiveCrawler(), + ChannelAfricaCrawler(), + NANCrawler(), + NigeriaTodayCrawler(), + BusinessDayOnlineCrawler(), + GlobalTimesCN(), + NationalMirrorCrawler(), + MonitorKECrawler(), + NewsvergeCrawler(), + SundiataPostCrawler(), + AgrilinksCrawler(), + BusinessDailyAfricaCrawler(), + TheBusinessPostCrawler(), + TheGuardianUKCrawler(), + IndependentNGCrawler(), + TheNerveAfricaCrawler(), + AmehNewsCrawler(), + SunNewsOnlineCrawler(), + SeedMagazineCrawler(), + HallmarkNewsCrawler(), + DestinyConnectCrawler(), + EconomistCrawler(), + WashingtonPostCrawler(), + AmaBhunganeCrawler(), + AfricaInvestorCrawler(), + OutrepreneursCrawler(), + CNBCAfricaCrawler(), + PlanIntlCrawler(), + BloombergCrawler(), # must come last GenericCrawler()] diff --git a/rebuild_db.py b/rebuild_db.py index a2b670a9..1231f558 100644 --- a/rebuild_db.py +++ b/rebuild_db.py @@ -1,5 +1,5 @@ -# from dexter.models import db -# from dexter.models.seeds import seed_db -# db.drop_all() -# db.create_all() -# seed_db(db) \ No newline at end of file +from dexter.models import db +from dexter.models.seeds import seed_db +db.drop_all() +db.create_all() +seed_db(db) \ No newline at end of file