dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:08:21Z |
dc.date.available | 2017-06-07T13:08:21Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2206 |
dc.description | A large web corpus (over 10 billion tokens) licensed under CreativeCommons license family in 50+ languages that has been extracted from CommonCrawl, the largest publicly available general Web crawl to date with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | ben |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | guj |
dc.language.iso | heb |
dc.language.iso | hin |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | mal |
dc.language.iso | mar |
dc.language.iso | mkd |
dc.language.iso | nep |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | sqi |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tam |
dc.language.iso | tel |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | urd |
dc.language.iso | vie |
dc.language.iso | zho |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-nc-sa/4.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (CC BY-NC-SA part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 4490768392 |
files.count | 55 |
Soubory tohoto záznamu
Licenční kategorie:
Licence: Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
Publicly Available
Licence: Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
- Název
- Lic_by-nc-sa_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Velikost
- 219.02 KB
- Formát
- application/x-gzip
- MD5
- ed3a554b3a6b14dfae309abd4c453a97
- Název
- Lic_by-nc-sa_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 3.68 MB
- Formát
- application/x-gzip
- MD5
- d5c8231d6a018bad56e151d894890eb0
- Název
- Lic_by-nc-sa_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Velikost
- 2.77 MB
- Formát
- application/x-gzip
- MD5
- 562cbdfcd85bbba377dff90dc30b8e51
- Název
- Lic_by-nc-sa_Lang_bn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 672.63 KB
- Formát
- application/x-gzip
- MD5
- 0626c8a35d615cf19738a4bf5e6bb6b4
- Název
- Lic_by-nc-sa_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 3.75 MB
- Formát
- application/x-gzip
- MD5
- 1674778a8de65767012940a3d05a31ac
- Název
- Lic_by-nc-sa_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 1.37 MB
- Formát
- application/x-gzip
- MD5
- e646885bcfdd6ce9068092ac0e49382f
- Název
- Lic_by-nc-sa_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 78.23 MB
- Formát
- application/x-gzip
- MD5
- e48b47fdfcc3323174fa040c7186dd44
- Název
- Lic_by-nc-sa_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 5.38 MB
- Formát
- application/x-gzip
- MD5
- d96b520832e35b38e8852f04e2e727ce
- Název
- Lic_by-nc-sa_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 953.73 MB
- Formát
- application/x-gzip
- MD5
- 170673218617c0593efa5791fb17d61e
- Název
- Lic_by-nc-sa_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00001.warc.gz
- Velikost
- 953.72 MB
- Formát
- application/x-gzip
- MD5
- 2b2652509761bd3425c59bbb5464f471
- Název
- Lic_by-nc-sa_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00002.warc.gz
- Velikost
- 953.7 MB
- Formát
- application/x-gzip
- MD5
- 7535bdec1e89e9d0946a73553c37a7e8
- Název
- Lic_by-nc-sa_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00003.warc.gz
- Velikost
- 620.92 MB
- Formát
- application/x-gzip
- MD5
- 572c21745326497df619f49a6b2d22d0
- Název
- Lic_by-nc-sa_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 339.18 MB
- Formát
- application/x-gzip
- MD5
- a64694925a6c0b61a922b087294ac405
- Název
- Lic_by-nc-sa_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 400.29 KB
- Formát
- application/x-gzip
- MD5
- 76b85b21fa7ccd9856874b71c1d2d5fc
- Název
- Lic_by-nc-sa_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 465.35 KB
- Formát
- application/x-gzip
- MD5
- eb6d334f5f0524dadf77e34686211573
- Název
- Lic_by-nc-sa_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 4.01 MB
- Formát
- application/x-gzip
- MD5
- 6f78e54a5383c643ade8e680e370bb36
- Název
- Lic_by-nc-sa_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 68.97 MB
- Formát
- application/x-gzip
- MD5
- 0dc57d825fc85a98671a5e4d1c856ef5
- Název
- Lic_by-nc-sa_Lang_gu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 4.72 KB
- Formát
- application/x-gzip
- MD5
- 1cf9aac410350025ed065e2b6df7034d
- Název
- Lic_by-nc-sa_Lang_he_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 739.01 KB
- Formát
- application/x-gzip
- MD5
- 8f5f1b46e012a6a6a7511b02ce90de1e
- Název
- Lic_by-nc-sa_Lang_hi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 90.93 KB
- Formát
- application/x-gzip
- MD5
- 89b4203315f9b0a0893aaa87b69a499c
- Název
- Lic_by-nc-sa_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 3.68 MB
- Formát
- application/x-gzip
- MD5
- 8c21113882d3e312887c253cba7ce723
- Název
- Lic_by-nc-sa_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 19.17 MB
- Formát
- application/x-gzip
- MD5
- cc99760ea47e410217d8d71d64e0cb0c
- Název
- Lic_by-nc-sa_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Velikost
- 8.16 MB
- Formát
- application/x-gzip
- MD5
- ff847175baaac97ca6a2bddd7f048dfb
- Název
- Lic_by-nc-sa_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 97.81 MB
- Formát
- application/x-gzip
- MD5
- 78bd57f7dca286ca1c1029c58c9dfddd
- Název
- Lic_by-nc-sa_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 7.32 MB
- Formát
- application/x-gzip
- MD5
- ab8cbb6da8a0d7062602521edc6574dc
- Název
- Lic_by-nc-sa_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 4.49 MB
- Formát
- application/x-gzip
- MD5
- c8513ce2b920b6a1b3fcc5307d9ee786
- Název
- Lic_by-nc-sa_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 406.21 KB
- Formát
- application/x-gzip
- MD5
- 3d1ac82e752bcf144878657973684076
- Název
- Lic_by-nc-sa_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 133.47 KB
- Formát
- application/x-gzip
- MD5
- e89b87839dde076d94a12ea9f2a8ba75
- Název
- Lic_by-nc-sa_Lang_mk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 230.21 KB
- Formát
- application/x-gzip
- MD5
- ba25d75abf8cc043f20607523b4bfcf3
- Název
- Lic_by-nc-sa_Lang_ml_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 13.89 KB
- Formát
- application/x-gzip
- MD5
- 85f797f0060a9a8d64d2d2ac029be12f
- Název
- Lic_by-nc-sa_Lang_mr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 29.7 KB
- Formát
- application/x-gzip
- MD5
- 73d15bcaa2662ddb8ec7a62598b59c34
- Název
- Lic_by-nc-sa_Lang_ne_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 69.09 KB
- Formát
- application/x-gzip
- MD5
- 21846c80539acb684c044b24e1c0f797
- Název
- Lic_by-nc-sa_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 14.94 MB
- Formát
- application/x-gzip
- MD5
- 70777ed1441ee3b16845784b714d45e9
- Název
- Lic_by-nc-sa_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 1.85 MB
- Formát
- application/x-gzip
- MD5
- 747790b07c73ea989d910952d39ff761
- Název
- Lic_by-nc-sa_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 4.54 MB
- Formát
- application/x-gzip
- MD5
- e6d4b73096e78dc432c5a36f91a0d2e5
- Název
- Lic_by-nc-sa_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 63.91 MB
- Formát
- application/x-gzip
- MD5
- 8a001d21f0fba4617634067272b27b47
- Název
- Lic_by-nc-sa_Lang_ro_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 2.94 MB
- Formát
- application/x-gzip
- MD5
- a4ac5fac6eee395f4e62c8eedc285613
- Název
- Lic_by-nc-sa_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 3.86 MB
- Formát
- application/x-gzip
- MD5
- eb8f15797d6d2816f47d224dcf441052
- Název
- Lic_by-nc-sa_Lang_sk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 932.24 KB
- Formát
- application/x-gzip
- MD5
- 48f9e3926ee6bde98bec1bd03b2f27a6
- Název
- Lic_by-nc-sa_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 684.6 KB
- Formát
- application/x-gzip
- MD5
- 174485195c2027c58573fe99165c159c
- Název
- Lic_by-nc-sa_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 84.04 KB
- Formát
- application/x-gzip
- MD5
- af14e23a3b70ada8e2a38d1b5f8cb4ce
- Název
- Lic_by-nc-sa_Lang_sq_NoBoilerplate_true_MinHtml_true-r-00020.seg-00000.warc.gz
- Velikost
- 91.79 KB
- Formát
- application/x-gzip
- MD5
- 81a1ee35454d5f0501cd9effb180bd2b
- Název
- Lic_by-nc-sa_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 7.92 MB
- Formát
- application/x-gzip
- MD5
- 90cbfbcaa66d1515a7a1e615c420284f
- Název
- Lic_by-nc-sa_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Velikost
- 415.88 KB
- Formát
- application/x-gzip
- MD5
- 4909fa06d4fcfbab46737bbe475f3e90
- Název
- Lic_by-nc-sa_Lang_ta_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 159.46 KB
- Formát
- application/x-gzip
- MD5
- ef9dfa69bc9e14e1f90741899c708df2
- Název
- Lic_by-nc-sa_Lang_te_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 79.96 KB
- Formát
- application/x-gzip
- MD5
- bc90d303a889aeec86cf315c2bf0352d
- Název
- Lic_by-nc-sa_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Velikost
- 14.52 MB
- Formát
- application/x-gzip
- MD5
- 7fc434963e8b0f39f8e98b81949b1f51
- Název
- Lic_by-nc-sa_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 138.33 KB
- Formát
- application/x-gzip
- MD5
- fc784dcdb157dd2c6f3a257d6e538f73
- Název
- Lic_by-nc-sa_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 3.31 MB
- Formát
- application/x-gzip
- MD5
- 6d9111906526e373c0f5dbdac0213878
- Název
- Lic_by-nc-sa_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 404.47 KB
- Formát
- application/x-gzip
- MD5
- 6c86ee6f17d0ea447c0b9b399eecfe88
- Název
- Lic_by-nc-sa_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 25.23 MB
- Formát
- application/x-gzip
- MD5
- c9841e1def2998bf733fcf074c80c50f
- Název
- Lic_by-nc-sa_Lang_ur_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 419.78 KB
- Formát
- application/x-gzip
- MD5
- a00d8fbdbaa04305a8629dd14db99b80
- Název
- Lic_by-nc-sa_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 1.9 MB
- Formát
- application/x-gzip
- MD5
- ec09c06dda6a758f9c667b8700710c5c
- Název
- Lic_by-nc-sa_Lang_zh-cn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 825.53 KB
- Formát
- application/x-gzip
- MD5
- 49a01fb570f4c8ed9cc1596d39e60064
- Název
- Lic_by-nc-sa_Lang_zh-tw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Velikost
- 263.11 KB
- Formát
- application/x-gzip
- MD5
- 502b750aabd0df199b7551eb3f62671f