Merge pull request #1208 from microsoft/kdd2020_tutorial_updated
Kdd2020 tutorial updated
miguelgfierro authored Sep 25, 2020
2 parents d32623c + 2d7249d commit a798694
Showing 25 changed files with 5,913 additions and 21 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -156,6 +156,14 @@ ml-20m/
*.model
*.mml
nohup.out

##### kdd 2020 tutorial data folder
scenarios/KDD2020-tutorial/data_folder/
scenarios/academic/KDD2020-tutorial/data_folder/
examples/07_tutorials/KDD2020-tutorial/data_folder/

*.vec
*.tsv
*.sh

tests/resources/
11 changes: 9 additions & 2 deletions examples/00_quick_start/dkn_MIND.ipynb
@@ -390,12 +390,19 @@
"\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html <br>\n",
"\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (reco_gpu)",
"display_name": "reco_gpu",
"language": "python",
"name": "reco_gpu"
},
@@ -409,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
@@ -798,12 +798,19 @@
"\n",
"2. LightGCN implementation [TensorFlow]: https://github.com/kuandeng/lightgcn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -817,7 +824,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.5.6"
}
},
"nbformat": 4,
46 changes: 46 additions & 0 deletions examples/07_tutorials/KDD2020-tutorial/README.md
@@ -0,0 +1,46 @@
# Environment setup
The following setup instructions assume a Linux system; testing was performed on Ubuntu.
We use Conda to install packages and manage the virtual environment. Type ``` conda list ``` to check whether conda is available on your machine. If not, follow the instructions at https://conda.io/projects/conda/en/latest/user-guide/install/linux.html to install either Miniconda or Anaconda (preferred) before proceeding.

1. Clone the repository
```bash
git clone https://github.com/microsoft/recommenders
```

1. Navigate to the tutorial folder. The materials for the tutorial are located under `recommenders/examples/07_tutorials/KDD2020-tutorial`.
```bash
cd recommenders/examples/07_tutorials/KDD2020-tutorial
```
1. Download the dataset
1. Download the dataset for the hands-on experiments and unzip it into `data_folder`:
```bash
wget https://recodatasets.blob.core.windows.net/kdd2020/data_folder.zip
unzip data_folder.zip -d data_folder
```
After unzipping, there are two folders under `data_folder`: `raw` and `my_cached`. The `raw` folder contains the original txt files from the COVID MAG dataset. The `my_cached` folder contains processed data files; if you miss a step during the hands-on tutorial, you can catch up by copying the corresponding files into the experiment folders.
1. Install the dependencies
1. The model pre-training uses a tool that converts the original data into embeddings, which requires `g++`. The following installs `g++` on a Debian-based Linux system.
```bash
sudo apt-get install g++
```
1. The Python scripts will run in a conda environment where the dependencies are installed. Create it from the `reco_gpu_kdd.yaml` file provided in this folder with the following commands.
```bash
conda env create -n kdd_tutorial_2020 -f reco_gpu_kdd.yaml
conda activate kdd_tutorial_2020
```
1. The tutorial is conducted in Jupyter notebooks. Register the newly created conda environment as a kernel with the Jupyter notebook server:
```bash
python -m ipykernel install --user --name kdd_tutorial_2020 --display-name "Python (kdd tutorial)"
```
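
As an optional sanity check after the download step, a small helper can confirm that `data_folder` was unzipped with the expected layout. This is an illustrative sketch, not part of the tutorial code; the folder names come from the description above.

```python
from pathlib import Path

def check_data_folder(root="data_folder", expected=("raw", "my_cached")):
    """Return a sorted list of expected subfolders missing under `root`.

    An empty list means the unzip step produced the layout described above.
    """
    root = Path(root)
    if not root.is_dir():
        # The folder itself is missing, so every expected subfolder is too.
        return sorted(expected)
    present = {p.name for p in root.iterdir() if p.is_dir()}
    return sorted(set(expected) - present)
```

If the returned list is non-empty, re-run the `unzip` step before continuing.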

# Tutorial notebooks/scripts
After the setup, users should be able to launch the notebooks locally with the command
```bash
jupyter notebook --port=8080
```
The notebooks can then be opened in a browser at `localhost:8080`.
Alternatively, if the Jupyter server runs on a remote machine, launch it with the following command (replacing the IP address with that of your server).
```bash
jupyter notebook --no-browser --ip=10.214.70.89 --port=8080
```
The notebooks can then be opened from a local browser at `10.214.70.89:8080`.
61 changes: 61 additions & 0 deletions examples/07_tutorials/KDD2020-tutorial/dkn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
data:
doc_size: 15 # each document is fixed to doc_size words: documents longer than doc_size are truncated, and shorter ones are padded with 0
his_size: 20 # max number of user click history items: only the last his_size clicks are kept, and histories shorter than his_size are padded with 0
word_size: 194755 # word vocabulary size
entity_size: 57267 # entity vocabulary size
data_format: dkn

info:
metrics:
- auc
pairwise_metrics:
- group_auc
- mean_mrr
- ndcg@2;4;6
show_step: 10000 # print loss every show_step batches

model:
method : classification
activation:
- sigmoid
attention_activation: relu
attention_dropout: 0.0
attention_layer_sizes: 32
dim: 32 # word embedding dim
use_entity: true # use entity embedding
use_context: true # use context embedding

entity_dim: 32 # entity embedding dim
entity_embedding_method: TransE
transform: true # add a transform layer for entity and context embeddings

dropout:
- 0.0
filter_sizes: # window size of kcnn filters
- 1
- 2
- 3
layer_sizes: # layer size for final prediction score layer
- 300
# model_type: DKN_without_context
model_type: dkn
num_filters: 50 # number of filters for each filter_size in the KCNN part
infer_model_name : epoch_2

train:
batch_size: 100
embed_l1: 0.000
embed_l2: 0.000001
epochs: 50
init_method: uniform
init_value: 0.01
layer_l1: 0.000
layer_l2: 0.000001
learning_rate: 0.00005
loss: log_loss
optimizer: adam
save_model: True
save_epoch : 1 # save model every save_epoch epochs
enable_BN : False
is_clip_norm: False
max_grad_norm: 0.5
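
The truncate-or-pad rule described in the `doc_size` and `his_size` comments can be sketched as follows. This is an illustrative helper, not the actual DKN preprocessing code:

```python
def fit_to_length(tokens, size, pad=0):
    """Fix a token sequence to exactly `size` entries: truncate if longer,
    right-pad with `pad` (0 by default) if shorter."""
    return list(tokens[:size]) + [pad] * max(0, size - len(tokens))
```

For example, with `doc_size: 15` a 20-word document keeps only its first 15 word ids, while a 3-word document becomes the 3 word ids followed by twelve zeros.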
22 changes: 22 additions & 0 deletions examples/07_tutorials/KDD2020-tutorial/lightgcn.yaml
@@ -0,0 +1,22 @@
#model
model:
model_type : "lightgcn"
embed_size : 64 # the embedding dimension of users and items
n_layers : 3 # number of layers of the model

#train
train:
batch_size : 1024
decay : 0.0001 # l2 regularization for embedding parameters
epochs : 1000 # number of epochs for training
learning_rate : 0.001
eval_epoch : -1 # if it is not -1, evaluate the model every eval_epoch; -1 means that evaluation will not be performed during training
top_k : 20 # number of items to recommend when calculating evaluation metrics

#show info
#metric : "recall", "ndcg", "precision", "map"
info:
save_model : True # whether to save model
save_epoch : 1 # if save_model is set to True, save the model every save_epoch
metrics : ["recall", "ndcg", "precision", "map"] # metrics for evaluation
MODEL_DIR : ./tests/resources/deeprec/lightgcn/model/lightgcn_model/ # directory of saved models
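
The `eval_epoch` convention noted above can be expressed as a one-line predicate. This is an illustrative sketch; the actual scheduling lives in the LightGCN training loop:

```python
def should_evaluate(epoch, eval_epoch):
    """True when the model should be evaluated at the end of `epoch` (1-based):
    every `eval_epoch` epochs, with -1 disabling evaluation during training."""
    return eval_epoch > 0 and epoch % eval_epoch == 0
```

With `eval_epoch: -1` as configured here, the predicate is false for every epoch, so no evaluation is performed during training.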
