# Copyright 2020 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=7

inherit check-reqs

DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"

# at least some of the files have poorly documented licenses
# TODO: create a USE flag for free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="~amd64 ~x86"
IUSE="extra"
RESTRICT="bindist mirror"

BDEPEND="app-arch/unzip"

# Packages that src_install() copies into the image as .zip archives
# (they are never unpacked by this ebuild).  Entries are "<subdir>/<id>".
PACKAGES_ZIP=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
	corpora/comtrans
	corpora/conll2007
	corpora/jeita
	corpora/knbc
	corpora/machado
	corpora/masc_tagged
	corpora/nombank.1.0
	corpora/panlex_swadesh
	corpora/propbank
	corpora/reuters
	corpora/semcor
	corpora/universal_treebanks_v20
	sentiment/vader_lexicon
	stemmers/snowball_data
)

# Packages that src_unpack() always extracts below ${S}/<subdir>.
PACKAGES_UNPACK=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
	corpora/abc
	corpora/alpino
	corpora/brown
	corpora/cess_cat
	corpora/cess_esp
	corpora/chat80
	corpora/city_database
	corpora/cmudict
	corpora/comparative_sentences
	corpora/conll2000
	corpora/conll2002
	corpora/crubadan
	corpora/dependency_treebank
	corpora/dolch
	corpora/europarl_raw
	corpora/floresta
	corpora/framenet_v15
	corpora/framenet_v17
	corpora/gazetteers
	corpora/genesis
	corpora/gutenberg
	corpora/ieer
	corpora/inaugural
	corpora/indian
	corpora/lin_thesaurus
	corpora/mac_morpho
	corpora/movie_reviews
	corpora/mte_teip5
	corpora/names
	corpora/nonbreaking_prefixes
	corpora/nps_chat
	corpora/omw
	corpora/opinion_lexicon
	corpora/pl196x
	corpora/ppattach
	corpora/product_reviews_1
	corpora/product_reviews_2
	corpora/pros_cons
	corpora/ptb
	corpora/qc
	corpora/rte
	corpora/senseval
	corpora/sentence_polarity
	corpora/sentiwordnet
	corpora/shakespeare
	corpora/sinica_treebank
	corpora/state_union
	corpora/stopwords
	corpora/subjectivity
	corpora/swadesh
	corpora/switchboard
	corpora/timit
	corpora/toolbox
	corpora/treebank
	corpora/twitter_samples
	corpora/udhr
	corpora/udhr2
	corpora/verbnet
	corpora/webtext
	corpora/wordnet
	corpora/wordnet_ic
	corpora/words
	grammars/book_grammars
	grammars/large_grammars
	grammars/sample_grammars
	misc/perluniprops
	models/bllip_wsj_no_aux
	models/moses_sample
	models/wmt15_eval
	models/word2vec_sample
	stemmers/porter_test
	stemmers/rslp
	taggers/averaged_perceptron_tagger
	taggers/averaged_perceptron_tagger_ru
	taggers/universal_tagset
	tokenizers/punkt
)

# Packages that are fetched and extracted only when USE=extra is enabled.
PACKAGES_UNPACK_EXTRA=(
	chunkers/maxent_ne_chunker
	corpora/biocreative_ppi
	corpora/brown_tei
	corpora/kimmo
	corpora/paradigms
	corpora/pe08
	corpora/pil
	corpora/problem_reports
	corpora/smultron
	corpora/unicode_samples
	corpora/verbnet3
	corpora/ycoe
	grammars/basque_grammars
	grammars/spanish_grammars
	help/tagsets
	misc/mwa_ppdb
	taggers/maxent_treebank_pos_tagger
)

# Append one SRC_URI entry per "<subdir>/<id>" argument.  Each upstream
# archive is renamed to nltk-<id>-${PV}.zip, where <id> is the part of
# the argument after its first slash.
add_data() {
	local entry
	for entry in "$@"; do
		SRC_URI+=" https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${entry}.zip -> nltk-${entry#*/}-${PV}.zip"
	done
}

# Unconditional downloads first, then the USE=extra group.
add_data "${PACKAGES_ZIP[@]}" "${PACKAGES_UNPACK[@]}"
SRC_URI+=" extra? ("
add_data "${PACKAGES_UNPACK_EXTRA[@]}"
SRC_URI+=" )"

# Disk space settings; presumably consumed by the inherited check-reqs
# eclass.  The build directory requirement mirrors the /usr one.
CHECKREQS_DISK_USR=3G
CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}

# Extract every PACKAGES_UNPACK archive (plus PACKAGES_UNPACK_EXTRA with
# USE=extra) into ${S}/<subdir>.  PACKAGES_ZIP archives are not touched
# here.
src_unpack() {
	local -a to_unpack=( "${PACKAGES_UNPACK[@]}" )
	if use extra; then
		to_unpack+=( "${PACKAGES_UNPACK_EXTRA[@]}" )
	fi

	local entry subdir id
	for entry in "${to_unpack[@]}"; do
		subdir=${entry%/*}
		id=${entry#*/}

		# unpack is run from inside the per-subdir directory, so
		# create it and change into it first.
		mkdir -p "${S}/${subdir}" || die
		cd "${S}/${subdir}" || die
		unpack "nltk-${id}-${PV}.zip"
	done
}

# Move the extracted tree into /usr/share/nltk_data, then install each
# PACKAGES_ZIP archive from ${DISTDIR} into its <subdir> as <id>.zip.
src_install() {
	dodir /usr/share/nltk_data
	mv * "${ED}/usr/share/nltk_data/" || die

	local entry subdir id
	for entry in "${PACKAGES_ZIP[@]}"; do
		subdir=${entry%/*}
		id=${entry#*/}

		insinto "/usr/share/nltk_data/${subdir}"
		newins "${DISTDIR}/nltk-${id}-${PV}.zip" "${id}.zip"
	done
}