-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrun.sh
executable file
·62 lines (45 loc) · 1.87 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Abort on the first failed command, on use of an unset variable,
# and on any failed stage of a pipeline.
set -euo pipefail

# dataset directory
dataset=datasets/arxiv
# text file name; one document per line
text_file=phrase_text.txt
# category name file; one category per line; each line contains "[category id] [category names]"
category_file=category_names.txt
# taxonomy file; one edge per line; each line contains "[parent id] [child id]"
taxo_file=taxonomy.txt
# taxonomy file in matrix form; generated by read_taxo.py
matrix_file="matrix_${taxo_file}"
# node level information file; generated by read_taxo.py
level_file="level_${taxo_file}"
# Generate the taxonomy matrix/level files once; skip when both already exist.
# All expansions are quoted so paths with spaces cannot word-split (SC2086),
# and a preprocessing failure aborts instead of silently continuing to training.
if [ ! -f "${dataset}/${matrix_file}" ] || [ ! -f "${dataset}/${level_file}" ]; then
	python src/read_taxo.py --dataset "${dataset}" --category_file "${category_file}" --taxo_file "${taxo_file}" || {
		echo "error: taxonomy preprocessing (src/read_taxo.py) failed" >&2
		exit 1
	}
fi
# category (tree-node) embedding output file name
tree_emb_file="tree_emb.txt"
# word embedding output file name
word_emb_file="emb.txt"
# dimensionality of the learned word embeddings
word_dim=100
# size of the local context window
window_size=5
# words occurring fewer than this many times in the corpus are discarded
min_count=5
# total number of passes over the corpus
iter=10
# number of parallel worker threads
threads=20
# file holding the pretrained word embeddings to initialize from
pretrain_emb="jose_100.txt"
# If the pretrained embedding is not yet extracted but the downloaded
# archive is present, unzip it. The archive is removed only after a
# successful extraction; a failed unzip aborts instead of letting the
# trainer run with a missing embedding file.
if [ ! -f "$pretrain_emb" ] && [ -f "jose_100.zip" ]; then
	echo "Unzipping downloaded pretrained embedding"
	unzip jose_100.zip && rm jose_100.zip || {
		echo "error: failed to extract jose_100.zip" >&2
		exit 1
	}
fi
# Build the JoSH trainer (no-op when already up to date). `make -C`
# replaces the original unchecked `cd ./src && make && cd ..`, which
# could run make in the wrong directory if the cd failed.
make -C src josh || {
	echo "error: build of src/josh failed" >&2
	exit 1
}

# Train joint word / category / taxonomy embeddings. Every expansion is
# quoted so paths containing spaces survive word-splitting (SC2086).
./src/josh -train "${dataset}/${text_file}" \
	-category-file "${dataset}/${category_file}" -matrix-file "${dataset}/${matrix_file}" -level-file "${dataset}/${level_file}" \
	-load-emb "${pretrain_emb}" -res "${dataset}/res.txt" -k 10 \
	-word-emb "${dataset}/${word_emb_file}" -tree-emb "${dataset}/${tree_emb_file}" \
	-size "${word_dim}" -window "${window_size}" -sample 1e-3 -word-margin 0.25 -cat-margin 0.9 \
	-alpha 0.025 -tree-period 128 -global-lambda 1.5 -lambda-cat 1.0 -lambda-tree 1.0 -negative 2 \
	-expand 1 -pretrain 2 \
	-min-count "${min_count}" -iter "${iter}" -threads "${threads}"