B36 Tórshavn vs. HB Tórshavn
This fixture is not just a battle between two top teams but also a test of resilience and strategy. B36 Tórshavn has been known for their disciplined defense, while HB Tórshavn prides itself on quick transitions from defense to attack.
Tactical Breakdown
- B36 Tórshavn: Expect a compact midfield with emphasis on ball retention. Their full-backs might push forward to support attacks.
- HB Tórshavn: Likely to exploit spaces left by B36’s attacking full-backs with pacey wingers.
Potential Game Changers
- Sara Bjørnsgaard: Her ability to find space in tight areas could be decisive.
- Anja Rasmussen: Known for her tackling prowess, she could disrupt HB’s rhythm.
Betting Angle
The draw seems like a safe bet given both teams’ defensive strengths. However, those looking for a riskier option might consider backing Sara Bjørnsgaard to score anytime at odds of 2.80.
<|repo_name|>zzyyuuu/CS5227<|file_sep|>/src/evaluation.py import json import os from collections import Counter import math def read_data(data_path): train = [] test = [] for line in open(data_path): train.append(json.loads(line)) return train def evaluate(data_path): print("Evaluating...") train = read_data(data_path) print("Finish reading data.") true_pos = [] false_pos = [] false_neg = [] for sample in train: query = sample['query'] doc_id = sample['doc_id'] label = sample['label'] if label == '0': continue relevant_docs = [doc['doc_id'] for doc in query['relevant_docs']] retrieved_docs = [doc['doc_id'] for doc in query['retrieved_docs']] if doc_id not in retrieved_docs: false_neg.append(doc_id) continue if doc_id not in relevant_docs: false_pos.append(doc_id) continue true_pos.append(doc_id) print("True positive: {}".format(len(true_pos))) print("False positive: {}".format(len(false_pos))) print("False negative: {}".format(len(false_neg))) pre = len(true_pos) / float(len(true_pos) + len(false_pos)) rec = len(true_pos) / float(len(true_pos) + len(false_neg)) f1_score = (2 * pre * rec) / (pre + rec) print("Precision: {}".format(pre)) print("Recall: {}".format(rec)) print("F1 score: {}".format(f1_score)) if __name__ == "__main__": data_path = os.path.join(os.path.dirname(__file__), "data/evaluation.json") evaluate(data_path)<|file_sep|># CS5227 Natural Language Processing ## Task ### Assignment #1 To implement word embeddings with Word2Vec algorithm. ### Assignment #2 To implement retrieval model with TF-IDF algorithm. ### Assignment #3 To implement retrieval model with BM25 algorithm. ### Assignment #4 To implement question answering system with Elasticsearch. ### Assignment #5 To implement text summarization with extractive approach. 
## Environment setup To install required libraries: pip install -r requirements.txt ## Usage ### Assignment #1 #### Preprocessing training data python src/preprocess.py --data_dir data/train_data --output_dir output/preprocessed_train_data --min_count=5 --max_vocab_size=100000 #### Training Word2Vec model python src/word2vec.py --train_dir output/preprocessed_train_data --model_dir output/word2vec_model --vector_size=200 --window_size=5 --num_iter=5 #### Using Word2Vec model python src/word2vec.py --model_dir output/word2vec_model --use_model ### Assignment #2 & Assignment #3 #### Preprocessing training data & building index file python src/preprocess.py --data_dir data/train_data --output_dir output/preprocessed_train_data --min_count=5 --max_vocab_size=100000 python src/tfidf_bm25.py --train_dir output/preprocessed_train_data --index_dir output/index_file #### Evaluation & testing python src/tfidf_bm25.py --index_dir output/index_file python src/tfidf_bm25.py --test_dir data/test_data --index_dir output/index_file python src/tfidf_bm25.py --eval_dir data/evaluation.json --index_dir output/index_file ### Assignment #4 #### Indexing document corpus using Elasticsearch API python src/es_index.py -d ./data/collection.json -i http://localhost:9200/ #### Testing question answering system using Elasticsearch API python src/es_search.py -i http://localhost:9200/ ### Assignment #5 #### Generating summary using TextRank algorithm python src/textrank.py -i data/sample.txt -o data/summary.txt <|file_sep|># -*- coding: utf-8 -*- import json import os import re import sys import string from collections import Counter def load_json(filename): with open(filename) as f: return json.load(f) def save_json(filename, data): with open(filename, 'w') as f: json.dump(data, f) def preprocess(filename): data = load_json(filename) patterns = [ r"[d]+", r".", r",", r"!", r"?", r";", r":", r"-", r""", r"'", r"&", r"/", r"\", r"(", r")", r"[", r"]", r"{", r"}", r"<[^<>]*>", r"[^x00-x7f]", 
string.punctuation, r"(http|ftp)s?://.*[rn]*", ] for i_docid in range(len(data)): doc = data[i_docid] doc_text = doc['text'] for pattern in patterns: doc_text = re.sub(pattern," ", doc_text) doc_tokens = doc_text.split() doc_tokens_filtered = [] for token in doc_tokens: if token.isalpha(): doc_tokens_filtered.append(token.lower()) doc['tokens'] = doc_tokens_filtered save_json(filename[:-5] + "_tokenized.json", data) if __name__ == '__main__': for filename in sys.argv[1:]: preprocess(filename)<|repo_name|>zzyyuuu/CS5227<|file_sep|>/src/es_search.py # -*- coding: utf-8 -*- from elasticsearch import Elasticsearch import argparse class Searcher(object): def __init__(self): self.es_client = None def connect(self): parser = argparse.ArgumentParser() parser.add_argument('-i', '--host', help='Elasticsearch host', default='http://localhost:9200/') args = parser.parse_args() self.es_client = Elasticsearch(args.host) def search(self): while True: query_str = input('Enter your query (or type "quit" or "exit"): ') if query_str == 'quit' or query_str == 'exit': break # Query type: phrase match query body_query_phrase_match_query_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1 = { "query": { "match_phrase": { "text": query_str } } } body_query_phrase_match_query_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2_2 = { "query": { "match_phrase": { "tokens": query_str.split() } } } 
body_query_phrase_match_query_fusion_with_relevance_score_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_fusion_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weight_of_text_and_tokens_based_on_field_length_norm_factor_of_field_length_ratio_of_text_to_tokens_fields_len_weighted_fusion_query_type_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum_fusion_method_is_simple_fusion_score_type_is_sum" : { "bool": { "should": [ {"match_phrase": {"text": query_str}}, {"match_phrase": {"tokens": query_str.split()}} ] } } } 
body_query_boolean_should_query_with_two_clauses_where_one_clause_is_a_phrase_match_query_for_the_entire_query_string_over_the_raw_document_content_body_without_tokenization_stemming_or_stopword_removal_or_any_other_preprocessing_step_the_other_clause_is_a_phrase_match_query_for_each_token_in_the_query_string_over_the_preprocessed_document_content_body_with_tokenization_stemming_stopword_removal_or_any_other_preprocessing_step_that_might_have_been_applied" : { "query": { "bool": { "should": [ {"match_phrase": {"text": query_str}}, {"match_phrase": {"tokens": query_str.split()}} ] } } } body_query_boolean_should_query_with_two_clauses_where_one_clause_is_a_phrase_match_query_for_the_entire_query_string_over_the_raw_document_content_body_without_tokenization_stemming_or_stopword_removal_or_any_other_preprocessing_step_the_other_clause_is_a_boolean_should_query_with_multiple_clauses_where_each_clause_represents_a_single_token_from_the_query_string_over_the_preprocessed_document_content_body_with_tokenization_stemming_stopword_removal_or_any_other_preprocessing_step_that_might_have_been_applied" : { "query": { "bool": { "should": [ {"match_phrase": {"text": query_str}}, {"bool":{ "should":[] }} ] } } } body_query_boolean_should_query_with_two_clauses_where_one_clause_is_a_phrase_match_query_for_the_entire_query_string_over_the_raw_document_content_body_without_tokenization_stemming_or_stopword_removal_or_any_other_preprocessing_step_the_other_clause_is_a_boolean_should_query_with_multiple_clauses_where_each_clause_represents_a_single_token_from_the_query_string_over_the_preprocessed_document_content_body_with_tokenization_stemming_stopword_removal_or_any_other_preprocessing_step_that_might_have_been_applied_using_inverse_document_frequency_weighting_for_term_importance" : { "query": { "bool": { "should": [ {"match_phrase": {"text": query_str}},