Python - TDoc

Table of Contents

Introduction

To facilitate tdoc-related processing, I wrote a simple Python module, named tdoc, based on:

  • Standard-library modules: os, functools, multiprocessing, argparse, urllib; and
  • Third-party libraries: pandas, pybtex.

Features

  • It takes a tdoc list file, usually with an .xlsx extension, as input.
  • It optionally filters the target tdocs by agenda item, title, and/or author (e.g., company).
  • It can download the tdocs of interest from 3GPP to a specified directory in parallel, using a multiprocessing pool with a configurable number of workers (--thread_num).
  • It can build a database from the tdocs that remain after filtering and export it to a BibTeX (.bib) file.

Syntax

Its syntax can be listed by running python tdoc.py --help:

usage: tdoc.py [-h] [--remote_dir REMOTE_DIR]
               [--output_dir OUTPUT_DIR] [--agenda_item AGENDA]
               [--title TITLE] [--author AUTHOR]
               [--bibtex_file BIBTEX_FILE]
               [--thread_num THREAD_NUM]
               tdoc_list

positional arguments:
  tdoc_list

options:
  -h, --help show this help message and exit
  --remote_dir REMOTE_DIR
  --output_dir OUTPUT_DIR
  --agenda_item AGENDA
  --title TITLE
  --author AUTHOR
  --bibtex_file BIBTEX_FILE
  --thread_num THREAD_NUM

Source code

import os
import pandas as pd
from pybtex.database import BibliographyData, Entry
from urllib.request import urlretrieve
from functools import partial
from multiprocessing import Pool
from argparse import ArgumentParser, Namespace

class Temporary_Document:
    """! A filterable table of tdocs loaded from a tdoc list spreadsheet.

    The table is kept in `self.df`, indexed by tdoc id, with the columns
    'title', 'author', 'agenda' and 'status'. The filter methods narrow it in
    place; the remaining entries can be exported to bibtex or downloaded.
    """

    # Regular expressions (matched against the whole author cell) and the
    # short label each one is replaced with.
    _AUTHOR_ALIASES = {
        '.*Qualcomm.*': 'QC',
        '.*Nokia.*': 'Nokia',
        '.*DOCOMO.*': 'DOCOMO',
        '.*Huawei.*': 'Huawei',
        '.*ZTE.*': 'ZTE',
        '.*InterDigital.*': 'InterDigital',
        '.*MediaTek.*': 'MTK',
    }

    def __init__(self, fn: str, remote_dir: str) -> None:
        """! Constructor
        @param fn File name of the tdoc list, i.e., an MS excel file.
        @param remote_dir Remote storage for tdoc(s)
        """
        # NOTE(review): skiprows=1 drops the first sheet row and, because
        # `header` keeps its default, the following row is consumed as the
        # header (its labels are replaced by `names`) -- confirm this matches
        # the layout of the tdoc list.
        self.df = pd.read_excel(
            fn,
            usecols=[0, 1, 2, 10, 13],
            skiprows=1,
            names=['id', 'title', 'author', 'agenda', 'status'],
            index_col=0,
            dtype=str,
        )
        self._normalize_authors()
        # Only tdocs whose status is exactly 'available' are kept.
        self.df = self.df[self.df['status'] == 'available']
        self.remote_dir = remote_dir

    def _normalize_authors(self) -> None:
        """! Replace verbose author names by their short labels."""
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the in-place form operates on a temporary
        # object and does not update `self.df` under pandas Copy-on-Write.
        self.df['author'] = self.df['author'].replace(self._AUTHOR_ALIASES, regex=True)

    def filter_agenda(self, ai: str) -> None:
        """! Filter the entries using agenda item
        @param ai Agenda item (exact match)
        """
        self.df = self.df[self.df['agenda'] == ai]

    def filter_title(self, title: str) -> None:
        """! Filter the entries using title
        @param title Pattern to search for in the title, ignoring case. It is
                     passed to pandas `str.contains`, which treats it as a
                     regular expression by default.
        """
        # na=False: entries without a title count as "no match"; otherwise the
        # mask contains NaN and the boolean indexing below raises.
        matches = self.df['title'].str.contains(title, case=False, na=False)
        self.df = self.df[matches]

    def filter_author(self, author: str) -> None:
        """! Filter the entries using author (company name)
        @param author Author (company name), compared exactly against the
                      normalized author column
        """
        self.df = self.df[self.df['author'] == author]

    def gen_db(self) -> None:
        """! Build the database based on the entries.

        Every remaining entry becomes an 'article' keyed by its tdoc id; the
        id is printed as the entry is added.
        """
        self.db = BibliographyData()
        for tdoc in self.df.itertuples():
            print(tdoc.Index)
            fields = {'author': tdoc.author, 'title': tdoc.title, 'agenda': tdoc.agenda}
            self.db.add_entry(tdoc.Index, Entry('article', fields))

    def gen_bib(self, fn: str) -> None:
        """! Generate bibtex file for further reference.

        gen_db() must have been called before, since it creates `self.db`.
        @param fn The name of bibtex file
        """
        self.db.to_file(fn, bib_format='bibtex')

    def download_parallel(self, output_dir: str, thread_num: int) -> None:
        """! Download the entries to a specified directory in a parallel way.
        @param output_dir Directory to store the tdocs
        @param thread_num Number of workers; they are processes of a
                          multiprocessing.Pool
        """
        pfunc = partial(download_tdoc, self.remote_dir, output_dir)
        pool = Pool(thread_num)
        try:
            pool.map(pfunc, self.df.index.tolist())
        finally:
            # Shut the workers down even when a download raised.
            pool.close()
            pool.join()

def download_tdoc(remote_dir: str, local_dir: str, id: str) -> None:
    """! Download a tdoc.

    The tdoc is fetched from `<remote_dir>/<id>.zip` and stored as
    `<local_dir>/<id>.zip`. Nothing is fetched when the local file exists.
    @param remote_dir Remote directory (URL) of the tdoc.
    @param local_dir Local directory for the tdoc.
    @param id Tdoc. number
    """
    # URLs always use '/', so do not build them with os.path.join, which
    # inserts the platform separator (a backslash on Windows).
    fn_remote = f"{remote_dir.rstrip('/')}/{id}.zip"
    fn_local = os.path.join(local_dir, f'{id}.zip')
    if os.path.exists(fn_local):
        print(f'{id} exists already, skipping ...')
        return
    # Download under a temporary name and rename on success. An interrupted
    # transfer then never leaves a truncated <id>.zip that later runs would
    # skip as "exists already".
    fn_partial = f'{fn_local}.part'
    try:
        urlretrieve(fn_remote, fn_partial)
        os.replace(fn_partial, fn_local)
    finally:
        # Still present only if the download or the rename failed.
        if os.path.exists(fn_partial):
            os.remove(fn_partial)
    print(f'{id} downloaded.')

def main(args: Namespace) -> None:
    """! Main routine

    Loads the tdoc list, applies the requested filters, writes the bibtex
    file into the output directory and finally downloads the tdocs.
    @param args Parsed command-line arguments
    """
    t = Temporary_Document(args.tdoc_list, args.remote_dir)
    if args.agenda is not None:
        t.filter_agenda(args.agenda)
    if args.title is not None:
        t.filter_title(args.title)
    if args.author is not None:
        t.filter_author(args.author)
    t.gen_db()
    # makedirs also creates missing parent directories and, unlike an
    # exists()/mkdir() pair, does not fail if the directory already exists.
    os.makedirs(args.output_dir, exist_ok=True)
    t.gen_bib(os.path.join(args.output_dir, args.bibtex_file))
    if args.remote_dir is None:
        # Without a remote location there is nothing to download from; the
        # bibtex file has already been written at this point.
        print('No remote directory given, skipping download.')
        return
    t.download_parallel(args.output_dir, args.thread_num)

if __name__ == "__main__":
    # Optional flags as (flag, attribute on the parsed namespace, type, default).
    optional_flags = (
        ('--remote_dir', 'remote_dir', str, None),
        ('--output_dir', 'output_dir', str, '.'),
        ('--agenda_item', 'agenda', str, None),
        ('--title', 'title', str, None),
        ('--author', 'author', str, None),
        ('--bibtex_file', 'bibtex_file', str, 'tdoc.bib'),
        ('--thread_num', 'thread_num', int, 1),
    )
    parser = ArgumentParser()
    # The tdoc list file is the only positional (mandatory) argument.
    parser.add_argument('tdoc_list', type=str)
    for flag, attribute, value_type, fallback in optional_flags:
        parser.add_argument(flag, type=value_type, dest=attribute, default=fallback)
    main(parser.parse_args())