DrFact for OpenCSR

Installation

Basic dependencies.

# Note that we use TF 1.15 because we rely on the tf.contrib package, which was removed in TF 2.x.
conda create --name drfact python=3.7
conda activate drfact
pip install tensorflow-gpu==1.15.2
conda install -c anaconda cudatoolkit=10.0
conda install -c anaconda cudnn  # make sure CUDA is 10.0 and cuDNN is 7.6.5
pip install tensorflow-hub bert-tensorflow 
pip install gdown
# Download https://github.com/google-research/language as a zip and unzip it (this creates language-master/).
pip install -e language-master
pip install tensorflow-determinism  # for reproducibility
git clone https://github.com/google-research/albert.git  # add ALBERT
pip install -r albert/requirements.txt

Test whether your GPUs can be used.

import tensorflow as tf
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
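If the device list does not include a GPU entry, a quick follow-up check from the shell (a minimal sketch using the TF 1.x API) is:

# Print the installed TF version and whether TF can see at least one GPU (TF 1.x API).
python -c "import tensorflow as tf; print(tf.__version__); print(tf.test.is_gpu_available())"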

Download the BERT files.

cd ~/  # can be any directory you like
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
unzip uncased_L-12_H-768_A-12.zip
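A quick way to confirm that the files referenced below (vocab.txt, bert_config.json, and the bert_model.ckpt files) are in place:

# The unzipped folder should contain vocab.txt, bert_config.json, and bert_model.ckpt.*
ls ~/uncased_L-12_H-768_A-12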

Common environment variables used below.

BERT_PATH=~/uncased_L-12_H-768_A-12
CORPUS_PATH=drfact_data/knowledge_corpus/
INDEX_PATH=drfact_data/local_drfact_index/   # same directory used by run_drfact.sh and the F2F steps below
INDEX_NAME=drfact_output_bert200             # index name expected by run_drfact.sh
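Optionally, sanity-check that the corpus files exist and that the index output directory is ready before indexing (a small sketch; whether index_corpus creates the output directory itself is not guaranteed here, so creating it up front is harmless):

# Sanity check: the concept vocab and corpus files referenced by the commands below.
ls -lh ${CORPUS_PATH}/gkb_best.vocab.txt ${CORPUS_PATH}/gkb_best.drfact_format.jsonl
mkdir -p ${INDEX_PATH}   # make sure the index output directory exists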

Indexing the Hypergraphs and Fact Embeddings.

Corpus preprocessing.

python -m language.labs.drfact.index_corpus \
--do_preprocess \
--concept_vocab_file ${CORPUS_PATH}/gkb_best.vocab.txt \
--corpus_file ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
--max_entity_length 5 \
--max_mentions_per_doc 20 \
--tokenizer_type bert_tokenization \
--vocab_file ${BERT_PATH}/vocab.txt \
--index_result_path ${INDEX_PATH}/${INDEX_NAME} \
--alsologtostderr
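If you want to inspect the corpus format first, you can pretty-print the first JSONL record (this makes no assumptions about its fields):

# Pretty-print the first fact record to see the corpus JSONL format.
head -n 1 ${CORPUS_PATH}/gkb_best.drfact_format.jsonl | python -m json.tool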

Pre-compute the Fact2Fact index.

for (( c=0; c<=94; c++ ))   # assume you have 95 CPUs.
do
   python -m language.labs.drfact.fact2fact_index \
    --do_preprocess \
    --corpus_file ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
    --fact2fact_index_dir ${INDEX_PATH}/fact2fact_index \
    --num_shards 94 --my_shard $c \
    --alsologtostderr &
done
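The sharded jobs above run in the background, so wait for them and check the shard output before combining (a minimal sketch; the exact file count depends on how fact2fact_index names its shards):

# Wait for all background sharding jobs, then check how many shard files were written.
wait
ls ${INDEX_PATH}/fact2fact_index | wc -l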

Combine the sharded index files into a single file.

python -m language.labs.drfact.fact2fact_index \
  --do_combine \
  --corpus_file ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
  --fact2fact_index_dir ${INDEX_PATH}/fact2fact_index \
  --num_shards 94 \
  --alsologtostderr

Convert Fact Embeddings.

Note that you need to run the DPR training first.

declare -a datasets=("ARC" "OBQA" "QASC")
for DATA_NAME in "${datasets[@]}"
do
  python -m language.labs.drfact.convert_dpr_index \
  --index_result_path ${INDEX_PATH}/drfact_fact_index \
  --dpr_pkl_path drfact_data/${DATA_NAME}_dpr_index_0.pkl \
  --embed_prefix ${DATA_NAME}_dpr_bert_base 
done 
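To sanity-check a DPR pickle before conversion, a minimal sketch that only prints the object's type and length (it assumes nothing about the pickle's internal layout; OBQA is used as the example):

# Peek at one DPR index pickle without assuming its internal structure.
python -c "import pickle; obj = pickle.load(open('drfact_data/OBQA_dpr_index_0.pkl', 'rb')); print(type(obj)); print(len(obj) if hasattr(obj, '__len__') else 'n/a')"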

Generate distant supervision. (Optional)

  declare -a datasets=("ARC" "OBQA" "QASC")
  declare -a splits=("train" "dev") 
  for DATA_NAME in "${datasets[@]}"
  do 
    for SPLIT in "${splits[@]}"
    do 
      python -m language.labs.drfact.preprocessing.decompose_concepts \
        --CONCEPT_VOCAB drfact_data/knowledge_corpus/gkb_best.vocab.txt \
        --jsonl_file drfact_data/datasets/${DATA_NAME}/${SPLIT}.jsonl \
        --linked_qas_file drfact_data/datasets/${DATA_NAME}/linked_${SPLIT}.jsonl &
    done
  done

  for DATA_NAME in "${datasets[@]}"
  do
    DATA_FOLDER=drfact_data/datasets/${DATA_NAME}
    RET=${DATA_NAME}_DPR
    for SPLIT in "${splits[@]}"
    do
      python -m language.labs.drfact.add_sup_facts \
        --linked_qas_file ${DATA_FOLDER}/linked_${SPLIT}.jsonl \
        --drfact_format_gkb_file  ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
        --ret_result_file ${DATA_FOLDER}/linked_${SPLIT}.${RET}.jsonl \
        --output_file ${DATA_FOLDER}/linked_${SPLIT}.sup_facts_from_${RET}.jsonl \
        --concept_vocab_file drfact_data/knowledge_corpus/gkb_best.vocab.txt \
        --max_num_facts 50 --split ${SPLIT} &
    done
  done

  for DATA_NAME in "${datasets[@]}"
  do
    RET=${DATA_NAME}_DPR
    DATA_FOLDER=drfact_data/datasets/${DATA_NAME}
    for SPLIT in "${splits[@]}"
    do
      python -m language.labs.drfact.add_middle_hops \
        --do hopping \
        --drfact_format_gkb_file ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
        --sup_fact_result_without_ans ${DATA_FOLDER}/linked_${SPLIT}.sup_facts_from_${RET}.jsonl \
        --sup_fact_result_with_ans ${DATA_FOLDER}/linked_${SPLIT}.sup_facts_from_${RET}_with_ans.jsonl \
        --output_file ${DATA_FOLDER}/linked_${SPLIT}.sup_facts_final_${RET}.jsonl &
    done
  done
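The loops above also fork background jobs; a small sketch to wait for them and count the resulting supervision examples (ARC train shown as an example):

# Let the background add_middle_hops jobs finish, then check the output for one split.
wait
wc -l drfact_data/datasets/ARC/linked_train.sup_facts_final_ARC_DPR.jsonl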

Update the Fact2Fact (F2F) links with this distant supervision.

  echo "" > drfact_data/local_drfact_index/fact2fact_index/add_links.tsv
  declare -a datasets=("ARC" "OBQA" "QASC")
  declare -a splits=("train")
  for DATA_NAME in "${datasets[@]}"
  do
    RET=${DATA_NAME}_DPR  # must match the suffix of the sup_facts_final files generated above
    DATA_FOLDER=drfact_data/datasets/${DATA_NAME}
    for SPLIT in "${splits[@]}"
    do
      SUP_FACT=${DATA_FOLDER}/linked_${SPLIT}.sup_facts_final_${RET}.jsonl
      python -m language.labs.drfact.extract_fact_links \
        --sup_facts_file ${SUP_FACT} \
        --fact_links_file drfact_data/local_drfact_index/fact2fact_index/add_links.tsv
    done
  done
  python language-master/language/labs/drfact/convert_add_links.py \
    drfact_data/local_drfact_index/fact2fact_index/add_links.tsv \
    drfact_data/local_drfact_index/fact2fact_index/f2f_95.json

Pre-compute the initial facts.

declare -a datasets=("ARC" "OBQA" "QASC") 
declare -a splits=("train" "dev")
COUNT=50
for DATA_NAME in "${datasets[@]}"
do
  DATA_FOLDER=drfact_data/datasets/${DATA_NAME}
  RET=${DATA_NAME}_DPR
  for SPLIT in "${splits[@]}"
  do
    SUP_FACT=${DATA_FOLDER}/linked_${SPLIT}.sup_facts_final_${RET}.jsonl  # optional
    python -m language.labs.drfact.add_init_facts \
      --linked_qas_file ${DATA_FOLDER}/linked_${SPLIT}.jsonl \
      --drfact_format_gkb_file  ${CORPUS_PATH}/gkb_best.drfact_format.jsonl \
      --ret_result_file ${DATA_FOLDER}/linked_${SPLIT}.${RET}.jsonl \
      --sup_facts_file ${SUP_FACT} \
      --output_file ${DATA_FOLDER}/linked_${SPLIT}.init_facts.jsonl \
      --max_num_facts ${COUNT} --split ${SPLIT} &
  done
done
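As above, these jobs run in the background; a small sketch to wait for them and confirm the init-facts files that training consumes (ARC shown as an example):

# Let the background add_init_facts jobs finish, then check the files used for training below.
wait
wc -l drfact_data/datasets/ARC/linked_train.init_facts.jsonl \
      drfact_data/datasets/ARC/linked_dev.init_facts.jsonl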

Training

DATA=ARC
ODIR=~/saved_models/drfact_models_${DATA}
HOP=3 # number of reasoning hops; any positive integer
GPUS=0 OUT_DIR=${ODIR} DATASET=${DATA} bash scripts/run_drfact.sh train ${HOP}   # Training
GPUS=1 OUT_DIR=${ODIR} DATASET=${DATA} bash scripts/run_drfact.sh continual_eval ${HOP}  # Online Evaluation on Dev
# the log file will be at `${ODIR}/hop_${HOP}/tf_log.cont_eval.txt`
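run_drfact.sh launches each job in the background and redirects its output, so to follow progress you can tail the log file, or (assuming the Estimator writes event files into the output directory) point TensorBoard at it:

# Follow the training log in real time (run_drfact.sh echoes this path when it starts).
tail -f ${ODIR}/hop_${HOP}/tf_log.train.txt
# Assumption: TF Estimator event files are written under the output dir, so TensorBoard also works:
# tensorboard --logdir ${ODIR}/hop_${HOP}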

Inference

DATA=ARC
ODIR=~/saved_models/drfact_models_${DATA}
HOP=3
GPUS=0 OUT_DIR=${ODIR} DATASET=${DATA} bash scripts/run_drfact.sh \
    predict ${HOP} [checkpoint_name] [train|dev|test] 
# an example string for the checkpoint_name is "model.ckpt-14600"
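To find a valid checkpoint_name, list what training saved under the hop directory (a small sketch; the checkpoint file is TF's standard bookkeeping file):

# List saved checkpoints; use the "model.ckpt-XXXX" prefix as [checkpoint_name].
ls ${ODIR}/hop_${HOP}/model.ckpt-*.index
cat ${ODIR}/hop_${HOP}/checkpoint   # also records the latest checkpoint names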

The content of run_drfact.sh:

  #!/bin/bash
  BERT_PATH=~/uncased_L-12_H-768_A-12  # BERT-base
  question_num_layers=11
  ENTAGG=max
  CORPUS_PATH=drfact_data/knowledge_corpus/
  INDEX_PATH=drfact_data/local_drfact_index/
  INDEX_NAME=drfact_output_bert200
  F2F_INDEX_NAME=fact2fact_index
  DATASET_PATH=drfact_data/datasets/${DATASET}
  NUM_HOPS=$2
  MODEL_OUTPUT_DIR=${OUT_DIR}/hop_$2
  PREDICT_PREFIX=dev
  if [ "$1" = "train" ]; 
  then
    echo "training mode"
    rm -r ${MODEL_OUTPUT_DIR}
    DO="do_train "
    mkdir -p ${MODEL_OUTPUT_DIR}
    LOG_FILE=${MODEL_OUTPUT_DIR}/tf_log.train.txt
  elif [ "$1" = "continual_eval" ];
  then
    echo "continual_eval mode"
    DO="do_predict "
    mkdir -p ${MODEL_OUTPUT_DIR}
    LOG_FILE=${MODEL_OUTPUT_DIR}/tf_log.cont_eval.txt
  elif [ "$1" = "predict" ];
  then
    echo "prediction mode"
    PREDICT_PREFIX=$4 # train, dev, or test
    DO="do_predict --use_best_ckpt_for_predict --model_ckpt_toload $3 "
    LOG_FILE=${MODEL_OUTPUT_DIR}/tf_log.$3-${PREDICT_PREFIX}-prediction.txt
  fi

  touch ${LOG_FILE}
  cp language-master/language/labs/drfact/model_fns.py ${LOG_FILE}.model_fn.py

  CUDA_VISIBLE_DEVICES=${GPUS} python -m language.labs.drfact.run_drfact \
    --vocab_file ${BERT_PATH}/vocab.txt \
    --tokenizer_model_file None \
    --bert_config_file ${BERT_PATH}/bert_config.json \
    --tokenizer_type bert_tokenization \
    --output_dir ${MODEL_OUTPUT_DIR} \
    --train_file ${DATASET_PATH}/linked_train.init_facts.jsonl \
    --predict_file ${DATASET_PATH}/linked_${PREDICT_PREFIX}.init_facts.jsonl \
    --predict_prefix ${PREDICT_PREFIX} \
    --init_checkpoint ${BERT_PATH}/bert_model.ckpt \
    --train_data_dir ${INDEX_PATH}/${INDEX_NAME} \
    --test_data_dir ${INDEX_PATH}/${INDEX_NAME} \
    --f2f_index_dir ${INDEX_PATH}/${F2F_INDEX_NAME} \
    --learning_rate 3e-05 \
    --warmup_proportion 0.1 \
    --train_batch_size 24 \
    --predict_batch_size 1 \
    --save_checkpoints_steps 100 \
    --iterations_per_loop 300 \
    --num_train_epochs 10.0 \
    --max_query_length 128 \
    --max_entity_len 5 \
    --qry_layers_to_use -1 \
    --qry_aggregation_fn concat \
    --question_dropout 0.3 \
    --question_num_layers ${question_num_layers} \
    --projection_dim 384 \
    --train_with_sparse  \
    --fix_sparse_to_one  \
    --predict_with_sparse  \
    --data_type opencsr \
    --model_type drfact \
    --supervision fact+entity \
    --num_mips_neighbors 500 \
    --entity_score_aggregation_fn ${ENTAGG} \
    --entity_score_threshold 1e-4 \
    --fact_score_threshold 1e-5 \
    --softmax_temperature 5.0 \
    --sparse_reduce_fn max \
    --sparse_strategy sparse_first \
    --num_hops ${NUM_HOPS} \
    --num_preds -1 \
    --embed_index_prefix ${DATASET}_dpr_bert_base \
    --$DO 2> ${LOG_FILE} &

  echo " "
  echo ${LOG_FILE}

  # watch -n 1 tail -n 50 ${LOG_FILE}