Recently I had the task of creating a dataset of scientific journals classified by ANVUR (the Italian agency for research rating).
Unfortunately the lists are split by research area and available only as PDFs at the URL
http://www.anvur.it/attivita/classificazione-delle-riviste/classificazione-delle-riviste-ai-fini-dellabilitazione-scientifica-nazionale/elenchi-di-riviste-scientifiche-e-di-classe-a/
In order to make my life easier I created a Python 3 script that downloads all the PDFs and, via the Tabula library, transforms the PDF tables into CSVs.
I put the script below (note that the URL is hardcoded; change it for future uses).
The CSVs still need some work due to multiline titles.
To make life easier I made two XLS files — one with the class-A journals and one with all journals — for Areas 11, 12, 13 and 14.
The XLS files can be downloaded here.
# Python 3.5
# PDF downloader and table/text extractor
from bs4 import BeautifulSoup
import requests
import time
import codecs
import PyPDF2
import os
from tabula import read_pdf
import pandas as pd
if __name__ == "__main__":
    # Interactive configuration: treat the PDFs as tables? (default No)
    # and (re-)download the PDFs? (default Yes).
    istable = input('Pdf are tables?[N]') or 'N'
    dnl = input('download PDFs?[Y]') or 'Y'

    if dnl == "Y":
        # Page listing the ANVUR journal-classification PDFs
        # (hardcoded; update for future uses).
        archive_url = "http://www.anvur.it/attivita/classificazione-delle-riviste/classificazione-delle-riviste-ai-fini-dellabilitazione-scientifica-nazionale/elenchi-di-riviste-scientifiche-e-di-classe-a/"
        response = requests.get(archive_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # link.get('href', '') avoids a KeyError on <a> tags with no href
        # attribute (the original link['href'] crashed on those).
        pdf_links = [link.get('href', '') for link in soup.find_all('a')
                     if link.get('href', '').endswith('pdf')]

        # Iterate through all links and download them one by one
        # into the current working directory.
        for link in pdf_links:
            if link[:4] != 'http':
                # Relative link: resolve it against the archive page URL.
                link = archive_url + link
            # Obtain the file name: the last path segment of the URL.
            file_name = link.split('/')[-1]
            print("Downloading file:%s" % file_name)
            # Stream the response so large PDFs are not held in memory whole.
            r = requests.get(link, stream=True)
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
            print("%s downloaded!\n" % file_name)
        print("All file downloaded!")

    # Input/output directories default to the current working directory.
    # os.path.join keeps the script portable (the original hardcoded "\\",
    # which only works on Windows).
    pdfDir = os.getcwd()
    txtDir = os.getcwd()

    # Convert every PDF found in pdfDir.
    for pdf_to_read in os.listdir(pdfDir):
        fileExtension = pdf_to_read.split(".")[-1]  # extension = last dot part
        if fileExtension == "pdf":
            pdfFilename = os.path.join(pdfDir, pdf_to_read)
            textFilename = os.path.join(txtDir, pdf_to_read + ".txt")
            if istable == 'N':
                # Plain-text extraction with PyPDF2, page by page.
                # Mode "w" (not "a") so a re-run does not duplicate content;
                # `with` guarantees both handles are closed (the original
                # leaked the PDF handle).
                with open(pdfFilename, "rb") as pdfFile, \
                        open(textFilename, "w") as textFile:
                    pdf = PyPDF2.PdfFileReader(pdfFile)
                    for page in pdf.pages:
                        textFile.write(page.extractText())
            else:
                # Tabula extracts the tables. With pages="all" recent
                # tabula-py versions return a list of DataFrames (one per
                # table); older ones returned a single DataFrame — handle
                # both before writing the CSV.
                df = read_pdf(pdfFilename, pages="all")
                if isinstance(df, list):
                    df = pd.concat(df, ignore_index=True)
                df.to_csv(textFilename)
No comments:
Post a Comment