#! python3 # noqa: E265
"""
Module in charge of analyzing raw offers from GeoRezo: extracting contract type,
place, etc. from title and abstract.
"""
# ###########################################################################
# ######### Libraries #############
# #################################
# Standard library
import logging
import re
# Django
from django.db import IntegrityError
# project modules
from elgeopaso.jobs.models import Contract, GeorezoRSS, Offer, Place, Source
from elgeopaso.utils import TextToolbelt
from .parsers import ContentParser, TitleParser
# ##############################################################################
# ########## Globals ###############
# ##################################
# module-level logger, named after this module
logger = logging.getLogger(__name__)
# regex matching markup tags (non-greedy `<...>`) and character entities:
# named (`&amp;`), decimal (`&#123;`) or hexadecimal (`&#x1f;`). No flags are
# set, so entity names and hex digits are matched in lower case only.
# NOTE(review): not referenced in the visible part of this module - confirm
# whether it is still used elsewhere before removing it.
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
# shared text helper instance; its `remove_html_markups` method is used by the
# analyzer class below to clean offers titles and contents
txt_toolbelt = TextToolbelt()
# ############################################################################
# ########## Classes ##############
# #################################
class GeorezoOfferAnalizer:
    """
    Analyze last offers published on GeoRezo and stored in the main table.

    For each offer ID, the raw RSS item is loaded, its title and content are
    stripped from HTML markups and parsed, then a clean ``Offer`` is created
    (``new`` truthy) or updated (``new`` falsy) and linked to the parsed
    technologies and jobs positions.
    """

    def __init__(
        self,
        li_offers_ids: list,
        opt_contracts: bool = True,
        opt_places: bool = True,
        opt_technos: bool = True,
        opt_skills: bool = True,
        opt_words: bool = True,
        source: str = "GEOREZO_RSS",
        new: bool = True,
    ):
        """
        Store the analysis options. No offer is processed until
        :meth:`analisis` is called.

        The ``opt_*`` flags are only stored on the instance: the code of this
        class does not read them back.

        :param list li_offers_ids: IDs list of offers to process
        :param bool opt_contracts: parse or not contracts types
        :param bool opt_places: parse or not contracts places
        :param bool opt_technos: parse or not contracts technologies
        :param bool opt_skills: parse or not contracts jobs label
        :param bool opt_words: parse or not contracts words
        :param str source: name of the ``Source`` row to attach to offers
        :param bool new: create (True) or update (False) offers
        """
        # parameters
        self.offers_ids = li_offers_ids
        self.opt_contracts = opt_contracts
        self.opt_places = opt_places
        self.opt_technos = opt_technos
        self.opt_skills = opt_skills
        self.opt_words = opt_words
        self.source = source
        self.new = new

        logger.debug("Launching analisis on %s offers.", len(self.offers_ids))
        super().__init__()

    # MAIN METHOD ------------------------------------------------------------
    def analisis(self) -> None:
        """
        Perform analysis on every offer ID given to the constructor.

        Offers are skipped (logged, loop goes on) when:

        - creation mode and an ``Offer`` with the same ``id_rss`` already
          exists, or saving raises ``IntegrityError``;
        - update mode and no ``Offer`` with this ``id_rss`` exists.

        The ``.objects.get()`` lookups (raw RSS item, contract, source, place)
        are not wrapped in any ``try``/``except`` here: whatever they raise
        when no single row matches propagates and stops the loop.
        """
        for offer_id in self.offers_ids:
            # kept as instance state, as before: last offer ID being processed
            self.offer_id = offer_id

            # in creation mode, skip offers already added. `self.new` is tested
            # first so that update mode does not pay for a useless query.
            if self.new and Offer.objects.filter(id_rss=offer_id).exists():
                logger.error("Offer RSS_ID already exists in DB: %s", offer_id)
                continue
            logger.debug("launch analisis on : %s", offer_id)

            # get raw offer from georezo_rss table
            raw_offer = GeorezoRSS.objects.get(id_rss=offer_id)

            # -- Title analysis ----------------------
            clean_title = txt_toolbelt.remove_html_markups(raw_offer.title)
            title_parser = TitleParser(offer_id=offer_id, input_title=clean_title)
            # value later matched against Contract.abbrv
            contract_type = title_parser.parse_contract_type()
            jobs_labels = title_parser.parse_jobs_positions()
            # value later matched against Place.name
            # NOTE(review): meaning of `mode=0` is defined by TitleParser
            place = title_parser.parse_place(mode=0)

            # -- Content analysis ----------------------
            clean_content = txt_toolbelt.remove_html_markups(raw_offer.content)
            content_parser = ContentParser(
                offer_id=offer_id, input_content=clean_content
            )
            technos = content_parser.parse_technology()

            # -- Add or update offer ----------------------
            if self.new:
                clean_offer = Offer(
                    id_rss=offer_id,
                    raw_offer=raw_offer,
                    **self._offer_fields(
                        raw_offer, clean_title, clean_content, contract_type, place
                    ),
                )
                try:
                    clean_offer.save()
                except IntegrityError as err_msg:
                    logger.error(
                        "Offer RSS_ID (%s) already exists in DB: %s",
                        offer_id,
                        err_msg,
                    )
                    continue
            else:
                matching_offers = Offer.objects.select_related().filter(
                    id_rss=offer_id
                )
                if not matching_offers.exists():
                    logger.info(
                        "Offer to update no longer exists and won't be created: %s",
                        offer_id,
                    )
                    continue
                # related rows are resolved only once the offer is known to exist
                matching_offers.update(
                    **self._offer_fields(
                        raw_offer, clean_title, clean_content, contract_type, place
                    )
                )
                # reload the instance: the queryset update does not return it
                clean_offer = Offer.objects.select_related().get(id_rss=offer_id)

            # associate ManyToMany relationships
            clean_offer.technologies.set(technos)
            clean_offer.jobs_positions.set(jobs_labels)
            logger.debug("Offer analyzed and inserted jobs.offer: %s", offer_id)

    def _offer_fields(
        self, raw_offer, clean_title, clean_content, contract_type, place
    ) -> dict:
        """
        Build the field values shared by offer creation and offer update.

        Related rows are fetched in this order: contract, source, place.

        :param raw_offer: raw RSS item, read for its ``pub_date``
        :param clean_title: offer title without HTML markups
        :param clean_content: offer content without HTML markups
        :param contract_type: value matched against ``Contract.abbrv``
        :param place: value matched against ``Place.name``

        :return: keyword arguments for ``Offer(...)`` or ``QuerySet.update(...)``
        :rtype: dict
        """
        return {
            "title": clean_title,
            "content": clean_content,
            "pub_date": raw_offer.pub_date,
            "contract": Contract.objects.get(abbrv=contract_type),
            "source": Source.objects.get(name=self.source),
            "place": Place.objects.get(name=place),
        }
# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
    # Nothing is executed when the module is run as a script: the string below
    # is a bare expression statement with no effect.
    """standalone execution."""