Web scraping: downloading the ANVUR lists of journals with Python

Recently I had the task of creating a dataset of the scientific journals classified by ANVUR (the Italian agency for research rating).
Unfortunately, the lists are split by research area and are available only as PDFs at URL

To make my life easier, I created a Python 3 script that downloads all the PDFs and, via the Tabula library, transforms the PDF tables into CSVs.

The script is below (note that the URL is hardcoded; change it for future use).

The CSVs still need some work because of multi-line titles.
To make life easier, I made 2 XLS files, with the A journals and with all journals, for Areas 11, 12, 13 and 14.

XLS files can be downloaded here.

# python 35

# pdf downloader code extractor

import codecs
import os
import time
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
from tabula import read_pdf

# URL of the web page that lists the PDFs. It is hardcoded: set it before
# answering "Y" to the download prompt.
ARCHIVE_URL = ""


def ask(prompt, default):
    """Prompt the user and return the answer, upper-cased.

    An empty answer yields *default*. Upper-casing means that "y"/"n" are
    understood the same way as "Y"/"N".
    """
    return input(prompt).strip().upper() or default


def select_pdf_urls(hrefs, base_url):
    """Return absolute URLs for every entry of *hrefs* that ends in 'pdf'.

    Relative links are resolved against *base_url* with ``urljoin``, so
    root-relative links ("/docs/a.pdf") and page URLs that end in a file
    name are handled correctly; absolute links are returned unchanged.
    """
    return [urljoin(base_url, href) for href in hrefs if href.endswith('pdf')]


def file_name_from_url(link):
    """Return the last path segment of *link*, used as the local file name."""
    return link.split('/')[-1]


def is_pdf_name(name):
    """Return True when *name* has the ``.pdf`` extension."""
    return os.path.splitext(name)[1] == ".pdf"


def download_file(link, file_name):
    """Stream *link* to *file_name* in 1 MiB chunks.

    Raises ``requests.HTTPError`` on an error status so that an HTML error
    page is never saved under a ``.pdf`` name.
    """
    r = requests.get(link, stream=True)
    try:
        r.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    finally:
        r.close()


def download_pdfs(archive_url):
    """Download every PDF linked from the page at *archive_url*.

    Files are written to the current working directory.
    """
    response = requests.get(archive_url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors that carry no href attribute at all.
    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]

    for link in select_pdf_urls(hrefs, archive_url):
        file_name = file_name_from_url(link)

        print ("Downloading file:%s" % file_name)
        download_file(link, file_name)
        print ("%s downloaded!\n" % file_name)

    print ("All file downloaded!")


def pdf_to_text(pdf_path, text_path):
    """Extract the text of every page of *pdf_path* into *text_path*.

    The text file is overwritten, so running the script twice does not
    duplicate its content.
    """
    with open(pdf_path, "rb") as pdf_file, \
            open(text_path, "w", encoding="utf-8") as text_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        for page in pdf.pages:
            text_file.write(page.extractText())


def pdf_to_csv(pdf_path, csv_path):
    """Extract the tables of *pdf_path* with Tabula and save them as one CSV."""
    tables = read_pdf(pdf_path, pages="all")

    # Depending on the tabula-py version, read_pdf returns either a single
    # DataFrame or a list with one DataFrame per table.
    if isinstance(tables, list):
        tables = pd.concat(tables, ignore_index=True) if tables else None

    if tables is None:
        print ("No table found in %s" % pdf_path)
        return

    tables.to_csv(csv_path, index=False)


def main():
    """Ask what to do, optionally download the PDFs, then convert them.

    Every ``.pdf`` file in the PDF directory is converted: to ``<name>.txt``
    (plain text, via PyPDF2) when the PDFs are not tables, otherwise to
    ``<name>.csv`` (via Tabula).
    """
    istable = ask('Pdf are tables?[N]', 'N')
    dnl = ask('download PDFs?[Y]', 'Y')

    if dnl == "Y":
        if not ARCHIVE_URL:
            raise SystemExit(
                "ARCHIVE_URL is empty: set it to the page that lists the PDFs.")
        download_pdfs(ARCHIVE_URL)

    # Empty strings mean "use the current working directory".
    pdf_dir = "" or os.getcwd()
    out_dir = "" or os.getcwd()

    for pdf_to_read in os.listdir(pdf_dir):
        if not is_pdf_name(pdf_to_read):
            continue

        pdf_path = os.path.join(pdf_dir, pdf_to_read)

        if istable == 'N':
            pdf_to_text(pdf_path, os.path.join(out_dir, pdf_to_read + ".txt"))
        else:
            pdf_to_csv(pdf_path, os.path.join(out_dir, pdf_to_read + ".csv"))


if __name__ == "__main__":
    main()


