version 1

2022-02-19 19:59:36 +03:00 · 2021-11-18 18:53:25 -08:00
commit 30fee3a613
9 changed files with 596 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,273 @@
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+### PyCharm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### PyCharm Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 David Hay
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,5 @@
+# LinkedIn Job Report Creator
+
+This program scrapes a LinkedIn job post using Python and creates a PDF containing
+the original post together with graphs of the words it uses most often.
+
--- a/creator/init.py
+++ b/creator/init.py
@@ -0,0 +1 @@
+from .creator import PullJob
--- a/creator/creator.py
+++ b/creator/creator.py
@@ -0,0 +1,171 @@
+import os
+import glob
+from collections import Counter
+from datetime import date
+
+import spacy
+from bs4 import BeautifulSoup
+import matplotlib.pyplot as plt
+from playwright.sync_api import sync_playwright
+
+from .pdf import PDF
+
+
+class PullJob:
+    """Scrape a LinkedIn job post and build a PDF report of its wording.
+
+    Intermediate files are written to ``docs/`` and the finished report to
+    ``report/``, both relative to the current working directory.
+    """
+
+    def __init__(self, url, file, noun_num, verb_num):
+        """Store the job URL, output base name and top-N noun/verb limits."""
+        self.url = url
+        self.file = file
+        self.noun_num = noun_num
+        self.verb_num = verb_num
+        self.body = []
+        self.body_string = ''
+        self.title = ''
+        self.company = ''
+        self.city = ''
+        self.common_nouns = []
+        self.common_verbs = []
+        self.common_list = []
+        self.cwd = os.getcwd() + '/docs/*'
+
+    def scrape(self):
+        """Download the rendered page HTML to ``docs/<file>.html``."""
+        print('downloading html...')
+        os.makedirs('docs', exist_ok=True)
+        with sync_playwright() as p:
+            browser = p.chromium.launch()
+            page = browser.new_page()
+            page.goto(self.url)
+            content = page.content()
+            browser.close()
+        # 'w' rather than 'a': appending piled a second page onto a stale file
+        with open(f'docs/{self.file}.html', 'w', encoding='utf-8') as f:
+            f.write(content)
+
+    def get_file(self):
+        """Parse the saved HTML into title, company, city and body lines.
+
+        Stores the lines in ``self.body`` and then calls ``write_text()``.
+        """
+        print('parsing file...')
+        with open(f"docs/{self.file}.html", encoding='utf-8') as fp:
+            soup = BeautifulSoup(fp, 'html.parser')
+        self.title = self._clean(soup.find('h1'))
+        self.company = self._clean(soup.find(
+            'a', class_='topcard__org-name-link '
+                        'topcard__flavor--black-link'))
+        self.city = self._clean(soup.find(
+            'span', class_='topcard__flavor '
+                           'topcard__flavor--bullet'))
+        content = soup.find(
+            'div', class_='show-more-less-html__markup '
+                          'show-more-less-html__markup--clamp-after-5')
+        strong = {x.text for x in content.find_all('strong')}
+        bullets = {x.text for x in content.find_all('li')}
+        body = []  # List of strings from body of post
+        for x in content.stripped_strings:
+            if x in bullets and x not in strong:
+                body.append(' - ' + x)
+            else:
+                # The lone ' ' is written out as a spacer line before x.
+                body.extend((' ', x))
+        self.body = body
+        self.write_text()
+
+    @staticmethod
+    def _clean(tag):
+        """Return the tag's text without newlines or surrounding spaces."""
+        # get_text() copes with nested children, where .string is None.
+        return tag.get_text().replace("\n", "").strip()
+
+    def write_text(self):
+        """Write the body to ``docs/<file>.txt``, then call ``frequency()``."""
+        with open(f"docs/{self.file}.txt", 'w', encoding='utf-8') as fp:
+            fp.writelines(x + '\n' for x in self.body)
+        self.body_string = ''.join(self.body)
+        self.frequency()
+
+    def frequency(self):
+        """Find the most common nouns and verbs in the body text.
+
+        Stop words and punctuation are skipped. Fills ``common_nouns``,
+        ``common_verbs`` and ``common_list`` as ``(word, count)`` pairs, then
+        calls ``horizontal_chart()``.
+        """
+        nlp = spacy.load('en_core_web_sm')
+        words = {"NOUN": [], "VERB": []}
+        for token in nlp(self.body_string):
+            if (token.pos_ in words and
+                    not token.is_stop and not token.is_punct):
+                words[token.pos_].append(token.text)
+
+        self.common_nouns = Counter(words["NOUN"]).most_common(self.noun_num)
+        self.common_verbs = Counter(words["VERB"]).most_common(self.verb_num)
+        self.common_list = self.common_nouns + self.common_verbs
+        self.horizontal_chart()
+
+    def horizontal_chart(self):
+        """Save a bar chart of ``common_list`` to ``docs/<file>.png``."""
+        word_list, counts = zip(*self.common_list)
+        plt.figure(0, clear=True)  # wipe bars left by an earlier call
+        plt.rcdefaults()
+        plt.barh(word_list, counts)
+        plt.title('Most common words')
+        plt.ylabel('Word')
+        plt.xlabel('Occurance')
+        plt.tight_layout()  # add padding
+        plt.savefig(f'docs/{self.file}.png')
+
+    def verb_pie_chart(self):
+        """Save a pie chart of the verbs to ``docs/verb_<file>.png``."""
+        self._pie_chart(self.common_verbs, 'Most common Verbs', 'verb')
+
+    def noun_pie_chart(self):
+        """Save a pie chart of the nouns to ``docs/noun_<file>.png``."""
+        self._pie_chart(self.common_nouns, 'Most common Nouns', 'noun')
+
+    def _pie_chart(self, common, title, prefix):
+        """Draw ``(word, count)`` pairs as a pie chart and save it."""
+        labels, counts = zip(*common)
+        plt.figure()  # a separate figure for each chart
+        plt.title(title)
+        plt.pie(counts, labels=labels, autopct='%1.1f%%', shadow=True,
+                startangle=90)
+        plt.savefig(f'docs/{prefix}_{self.file}.png')
+
+    def create_PDF(self):
+        """Combine the text file and charts into ``report/<file>.pdf``."""
+        print('creating report...')
+        today = date.today().strftime('%m/%d/%y')
+        os.makedirs('report', exist_ok=True)
+        pdf = PDF()  # report layout class defined in pdf.py
+        pdf.set_title(f'{self.title} Job Report')
+        pdf.set_author('David Hay')
+        pdf.print_job(
+            today, self.title, self.company, self.city,
+            f'docs/{self.file}.txt', self.file)
+        pdf.output(f'report/{self.file}.pdf')
+
+    def delete_files(self):
+        """Delete every file matched by ``self.cwd`` (``<cwd>/docs/*``)."""
+        for f in glob.glob(self.cwd):
+            if os.path.isfile(f):  # os.remove() cannot remove directories
+                os.remove(f)
+
+    def run_all(self):
+        """Run the whole pipeline, from download to finished report."""
+        self.scrape()
+        # get_file() already chains write_text() -> frequency() ->
+        # horizontal_chart(); calling them again here repeated that work.
+        self.get_file()
+        self.verb_pie_chart()
+        self.noun_pie_chart()
+        self.create_PDF()
+        self.delete_files()  # If you want to keep other files take out line
+        print('Done!')
--- a/creator/pdf.py
+++ b/creator/pdf.py
@@ -0,0 +1,55 @@
+from fpdf import FPDF
+
+"""
+d = today
+t = title
+c = company
+lo = location
+xp = text filepath
+n = file name
+"""
+
+
+class PDF(FPDF):
+    """FPDF layout for a one-job report: title block, body text, charts."""
+    def header(self):
+        self.set_font("helvetica", "B", 15)  # page header only sets the font
+
+    def footer(self):
+        """Print the centred page number 1.5 cm from the bottom, in gray."""
+        self.set_y(-15)
+        self.set_font("helvetica", "I", 8)
+        self.set_text_color(128)
+        self.cell(0, 10, f"Page {self.page_no()}", 0, 0, "C")
+
+    def job_title(self, d, t, c, lo):
+        """Print date ``d``, role ``t``, company ``c`` and location ``lo``."""
+        self.set_font("helvetica", "B", 8)
+        self.cell(0, 6, f'Date: {d}', 0, 1, 'R')
+        self.ln(1)
+        self.set_font("helvetica", "B", 15)
+        for line in (f'Role: {t}', f'Company: {c}', f'Location: {lo}'):
+            self.cell(0, 6, self._latin1(line), 0, 1, 'C')
+            self.ln(4)
+
+    def job_body(self, xp, n):
+        """Print the text file ``xp`` (read as UTF-8), then the ``n`` charts."""
+        with open(xp, encoding="utf-8", errors="replace") as fp:
+            txt = fp.read()
+        self.set_font('helvetica', size=12)
+        self.multi_cell(0, 7, self._latin1(txt))
+        self.ln()
+        for name in (n, f'verb_{n}', f'noun_{n}'):
+            self.image(f'docs/{name}.png', x=27, w=150)
+
+    def _latin1(self, txt):
+        """Swap characters outside latin-1 (core-font range) for ``?``."""
+        return txt.encode("latin-1", "replace").decode("latin-1")
+
+    def print_job(self, d, t, c, lo, xp, n):
+        """Add a page, then print the title block and the body for one job."""
+        self.add_page()
+        self.job_title(d, t, c, lo)
+        self.job_body(xp, n)
+
+
--- a/main.py
+++ b/main.py
@@ -0,0 +1,12 @@
+from creator import PullJob
+
+
+def main():
+    """Prompt for the post URL and output file name, then build the report."""
+    url = input('Enter URL')
+    file = input('job')  # base name of the docs/ intermediates and the PDF
+    PullJob(url, file, 10, 10).run_all()  # chart the top 10 nouns and verbs
+
+
+if __name__ == '__main__':
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,46 @@
+beautifulsoup4==4.10.0
+blis==0.7.5
+catalogue==2.0.6
+certifi==2021.10.8
+charset-normalizer==2.0.7
+click==8.0.3
+cycler==0.11.0
+cymem==2.0.6
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
+fonttools==4.28.1
+fpdf==1.7.2
+greenlet==1.1.2
+idna==3.3
+Jinja2==3.0.3
+kiwisolver==1.3.2
+langcodes==3.3.0
+MarkupSafe==2.0.1
+matplotlib==3.5.0
+murmurhash==1.0.6
+numpy==1.21.4
+packaging==21.2
+pathy==0.6.1
+Pillow==8.4.0
+playwright==1.16.1
+preshed==3.0.6
+pydantic==1.8.2
+pyee==8.2.2
+pyparsing==2.4.7
+python-dateutil==2.8.2
+requests==2.26.0
+setuptools-scm==6.3.2
+six==1.16.0
+smart-open==5.2.1
+soupsieve==2.3.1
+spacy==3.2.0
+spacy-legacy==3.0.8
+spacy-loggers==1.0.1
+srsly==2.4.2
+thinc==8.0.13
+tomli==1.2.2
+tqdm==4.62.3
+typer==0.4.0
+typing_extensions==4.0.0
+urllib3==1.26.7
+wasabi==0.8.2
+websockets==10.1
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,12 @@
+from distutils.core import setup
+
+# NOTE(review): distutils is deprecated (PEP 632); consider setuptools.setup.
+setup(
+    name='creator',
+    version='1.0',
+    description='Makes a graphical PDF report from LinkedIn job posts.',
+    url='https://github.com/dave-hay/linkedin-job-report-creator',
+    author='David Hay', author_email='davehay93@gmail.com',
+    license='MIT License',
+    packages=['creator'],
+)