From 254aaf518afba46fac60aaa22f718afd3a3bd249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E5=B5=A9?= Date: Wed, 25 Jul 2018 22:23:24 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=B8=80=E4=B8=AA=E9=98=9F?= =?UTF-8?q?=E5=88=97=E9=87=8D=E5=A4=8D=E7=9A=84=E9=97=AE=E9=A2=98=20?= =?UTF-8?q?=E5=BC=80=E5=A7=8B=E5=AE=9E=E7=8E=B0=E5=AE=8C=E5=85=A8=E5=88=86?= =?UTF-8?q?=E5=B8=83=E5=BC=8F=E6=A1=86=E6=9E=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DotnetSpider.sln | 12 +++ .../DotnetSpider.Broker.csproj | 12 +++ src/DotnetSpider.Broker/Program.cs | 30 +++++++ .../DotnetSpider.Core.csproj | 2 +- .../Pipeline/BasePipeline.cs | 2 +- src/DotnetSpider.Core/Spider.cs | 2 +- .../ConfigurableSpider.cs | 23 +++++ .../DotnetSpider.Extension.csproj | 2 +- .../Pipeline/DbModelPipeline.cs | 2 +- src/DotnetSpider.Extension/spider.json | 88 +++++++++++++++++++ .../DotnetSpider.Extraction.csproj | 2 +- .../Model/Attribute/FieldSelector.cs | 3 +- .../Model/Attribute/SharedValueSelector.cs | 3 +- .../Model/Attribute/TableInfo.cs | 6 +- .../Model/Attribute/TargetRequestSelector.cs | 6 +- .../Model/Attribute/ToNext.cs | 6 +- src/DotnetSpider.Extraction/Model/DataType.cs | 5 +- .../Model/FieldOptions.cs | 5 +- .../Model/ModelDefinition.cs | 2 + src/DotnetSpider.Extraction/Model/Selector.cs | 8 +- .../Model/TableNamePostfix.cs | 6 +- src/DotnetSpider.Extraction/SelectorType.cs | 5 +- .../DotnetSpider.Node.csproj | 10 +++ src/DotnetSpider.Node/Program.cs | 15 ++++ src/DotnetSpider.Sample/Program.cs | 4 +- src/DotnetSpider.Sample/docs/ModelSpider.cs | 3 +- 26 files changed, 246 insertions(+), 18 deletions(-) create mode 100644 src/DotnetSpider.Broker/DotnetSpider.Broker.csproj create mode 100644 src/DotnetSpider.Broker/Program.cs create mode 100644 src/DotnetSpider.Extension/ConfigurableSpider.cs create mode 100644 src/DotnetSpider.Extension/spider.json create mode 100644 src/DotnetSpider.Node/DotnetSpider.Node.csproj create mode 100644 src/DotnetSpider.Node/Program.cs diff --git a/DotnetSpider.sln b/DotnetSpider.sln index 3918eac90..8f8a23f08 100644 --- a/DotnetSpider.sln +++ b/DotnetSpider.sln @@ -40,6 +40,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution runtests.sh = runtests.sh EndProjectSection EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Node", "src\DotnetSpider.Node\DotnetSpider.Node.csproj", "{C2BAD1A6-6744-4927-B014-67647D3FAD58}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Broker", "src\DotnetSpider.Broker\DotnetSpider.Broker.csproj", "{93099A1A-128B-4023-9271-F535A11F2490}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -98,6 +102,14 @@ Global {372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Debug|Any CPU.Build.0 = Debug|Any CPU {372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.ActiveCfg = Release|Any CPU {372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.Build.0 = Release|Any CPU + {C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.Build.0 = Release|Any CPU + {93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.Build.0 = Debug|Any CPU + {93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.ActiveCfg = Release|Any CPU + {93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/DotnetSpider.Broker/DotnetSpider.Broker.csproj b/src/DotnetSpider.Broker/DotnetSpider.Broker.csproj new file mode 100644 index 000000000..6305c3f9e --- /dev/null +++ b/src/DotnetSpider.Broker/DotnetSpider.Broker.csproj @@ -0,0 +1,12 @@ + + + + Exe + netcoreapp2.1 + + + + + + + diff --git a/src/DotnetSpider.Broker/Program.cs b/src/DotnetSpider.Broker/Program.cs new file mode 100644 index 000000000..15bb3c841 --- /dev/null +++ b/src/DotnetSpider.Broker/Program.cs @@ -0,0 +1,30 @@ +using Confluent.Kafka; +using Confluent.Kafka.Serialization; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Text; + +namespace DotnetSpider.Broker +{ + class Program + { + static void Main(string[] args) + { + var config = new Dictionary + { + { "bootstrap.servers", "192.168.90.106:9092" } + }; + + using (var producer = new Producer(config, null, new StringSerializer(Encoding.UTF8))) + { + for (int i = 0; i < 1000; ++i) + { + var dr = producer.ProduceAsync("my-topic", null, "test message text").Result; + Console.WriteLine($"Delivered '{dr.Value}' to: {dr.TopicPartitionOffset}"); + } + } + Console.Read(); + } + } +} diff --git a/src/DotnetSpider.Core/DotnetSpider.Core.csproj b/src/DotnetSpider.Core/DotnetSpider.Core.csproj index 080789480..121837f42 100644 --- a/src/DotnetSpider.Core/DotnetSpider.Core.csproj +++ b/src/DotnetSpider.Core/DotnetSpider.Core.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.1 + 3.0.2 zlzforever@163.com; DotnetSpider.Core Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Core/Pipeline/BasePipeline.cs b/src/DotnetSpider.Core/Pipeline/BasePipeline.cs index f40d6b2fc..6c52a8d81 100644 --- a/src/DotnetSpider.Core/Pipeline/BasePipeline.cs +++ b/src/DotnetSpider.Core/Pipeline/BasePipeline.cs @@ -6,7 +6,7 @@ namespace DotnetSpider.Core.Pipeline /// /// 数据管道抽象, 通过数据管道把解析的数据存到不同的存储中(文件、数据库) /// - public abstract class BasePipeline : IPipeline + public abstract class BasePipeline : Named, IPipeline { /// /// 处理页面解析器解析到的数据结果 diff --git a/src/DotnetSpider.Core/Spider.cs b/src/DotnetSpider.Core/Spider.cs index 28876a251..40c467a69 100644 --- a/src/DotnetSpider.Core/Spider.cs +++ b/src/DotnetSpider.Core/Spider.cs @@ -144,7 +144,7 @@ protected void VerifyDataOrGenerateReport(string[] arguments) public Site Site { get => _site; - protected set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); } + set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); } } /// diff --git a/src/DotnetSpider.Extension/ConfigurableSpider.cs b/src/DotnetSpider.Extension/ConfigurableSpider.cs new file mode 100644 index 000000000..29dd8fb90 --- /dev/null +++ b/src/DotnetSpider.Extension/ConfigurableSpider.cs @@ -0,0 +1,23 @@ +using DotnetSpider.Core; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace DotnetSpider.Extension +{ + public class ConfigurableSpider : Spider + { + private readonly string _json; + + public ConfigurableSpider(string json) + { + _json = json; + } + + protected override void OnInit(params string[] arguments) + { + + } + } +} diff --git a/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj b/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj index 7e6826d71..fb114f1fb 100644 --- a/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj +++ b/src/DotnetSpider.Extension/DotnetSpider.Extension.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.1 + 3.0.2 zlzforever@163.com; DotnetSpider.Extension Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs b/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs index 6e5098056..bd519b057 100644 --- a/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs +++ b/src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs @@ -22,7 +22,7 @@ public abstract class DbModelPipeline : ModelPipeline public int RetryTimes { get; set; } = 600; - public string ConnectString { get; private set; } + public string ConnectString { get; set; } /// /// 数据库忽略大小写 diff --git a/src/DotnetSpider.Extension/spider.json b/src/DotnetSpider.Extension/spider.json new file mode 100644 index 000000000..7a6d69c7f --- /dev/null +++ b/src/DotnetSpider.Extension/spider.json @@ -0,0 +1,88 @@ +{ + "Model": { + "Selector": { + "Type": "XPath", + "Expression": "//div[@class='yk-pack pack-film']", + "Arguments": null + }, + "Take": 0, + "TakeFromHead": true, + "Table": { + "Database": "youku", + "Name": "show", + "Postfix": "Today", + "UpdateColumns": null, + "Indexs": null, + "Uniques": null, + "FullName": "show_2018_07_25" + }, + "Fields": [ + { + "NotNull": false, + "Option": "None", + "Length": 255, + "Name": "name", + "IgnoreStore": false, + "DataType": "String", + "IsPrimary": false, + "Formatters": null, + "Type": "XPath", + "Expression": ".//img[@class='quic']/@alt", + "Arguments": null + }, + { + "NotNull": false, + "Option": "None", + "Length": 255, + "Name": "index", + "IgnoreStore": false, + "DataType": "Int", + "IsPrimary": false, + "Formatters": null, + "Type": "Enviroment", + "Expression": "index", + "Arguments": null + }, + { + "NotNull": false, + "Option": "None", + "Length": 255, + "Name": "id", + "IgnoreStore": false, + "DataType": "Int", + "IsPrimary": true, + "Formatters": null, + "Type": "Enviroment", + "Expression": "", + "Arguments": null + } + ], + "TargetRequestSelectors": [ + { + "XPaths": [ "//ul[@class='yk-pages']" ], + "Patterns": [ "(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?" ] + } + ], + "SharedValueSelectors": null + }, + "Scheduler": { + "Name": "QueueDuplicateRemovedScheduler" + }, + "Downloader": { + "Name": "HttpClientDownloader", + "AllowAutoRedirect": true + }, + "Pipeline": { + "Name": "MySqlEntityPipeline", + "ConnectString": "Database='mysql';Data Source=localhost;password=;User ID=root;Port=3306;SslMode=None" + }, + "ClearSchedulerAfterCompleted": true, + "StatusFlushInterval": 5000, + "PipelineRetryTimes": 2, + "PipelineCachedSize": 5, + "RedialExecutor": "MutexRedialExecutor", + "EmptySleepTime": 15000, + "ExitWhenComplete": true, + "ThreadNum": 1, + "SkipTargetRequestsWhenResultIsEmpty": true +} diff --git a/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj b/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj index 3ae7755ba..cb7040db3 100644 --- a/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj +++ b/src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj @@ -3,7 +3,7 @@ net40;net45;netstandard2.0 true true - 3.0.0 + 3.0.2 zlzforever@163.com; DotnetSpider.Extraction Copyright 2018 Lewis Zou diff --git a/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs b/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs index e9663a09f..d15d8d12c 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs @@ -1,4 +1,5 @@ -using System; +using Newtonsoft.Json; +using System; namespace DotnetSpider.Extraction.Model.Attribute { diff --git a/src/DotnetSpider.Extraction/Model/Attribute/SharedValueSelector.cs b/src/DotnetSpider.Extraction/Model/Attribute/SharedValueSelector.cs index 0a51530c8..e14c0de96 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/SharedValueSelector.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/SharedValueSelector.cs @@ -1,4 +1,5 @@ -using System; +using Newtonsoft.Json; +using System; namespace DotnetSpider.Extraction.Model.Attribute { diff --git a/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs b/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs index 1bdac794c..aecbffa70 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs @@ -1,4 +1,5 @@ -using System; +using Newtonsoft.Json; +using System; namespace DotnetSpider.Extraction.Model.Attribute { @@ -10,6 +11,9 @@ public class TableInfo : System.Attribute { private string _name; + [JsonIgnore] + public override object TypeId => base.TypeId; + /// /// 数据库名 /// diff --git a/src/DotnetSpider.Extraction/Model/Attribute/TargetRequestSelector.cs b/src/DotnetSpider.Extraction/Model/Attribute/TargetRequestSelector.cs index c8d392efd..405d5334a 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/TargetRequestSelector.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/TargetRequestSelector.cs @@ -1,4 +1,5 @@ -using System; +using Newtonsoft.Json; +using System; namespace DotnetSpider.Extraction.Model.Attribute { @@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute [AttributeUsage(AttributeTargets.Class, AllowMultiple = true)] public class TargetRequestSelector : System.Attribute { + [JsonIgnore] + public override object TypeId => base.TypeId; + public TargetRequestSelector() { } public TargetRequestSelector(string[] xpaths, string[] patterns = null) diff --git a/src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs b/src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs index 9420c1b4a..2efcdf89a 100644 --- a/src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs +++ b/src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs @@ -1,4 +1,5 @@ -using System; +using Newtonsoft.Json; +using System; namespace DotnetSpider.Extraction.Model.Attribute { @@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute [AttributeUsage(AttributeTargets.Property, AllowMultiple = true)] public class ToNext : System.Attribute { + [JsonIgnore] + public override object TypeId => base.TypeId; + /// /// 保存到起始链接的额外信息 /// diff --git a/src/DotnetSpider.Extraction/Model/DataType.cs b/src/DotnetSpider.Extraction/Model/DataType.cs index eaf1068c4..c0970c533 100644 --- a/src/DotnetSpider.Extraction/Model/DataType.cs +++ b/src/DotnetSpider.Extraction/Model/DataType.cs @@ -1,10 +1,13 @@ -using System; +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; +using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace DotnetSpider.Extraction.Model { + [JsonConverter(typeof(StringEnumConverter))] public enum DataType { None, diff --git a/src/DotnetSpider.Extraction/Model/FieldOptions.cs b/src/DotnetSpider.Extraction/Model/FieldOptions.cs index 899f463a8..4c5b97216 100644 --- a/src/DotnetSpider.Extraction/Model/FieldOptions.cs +++ b/src/DotnetSpider.Extraction/Model/FieldOptions.cs @@ -1,4 +1,6 @@ -using System; +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; +using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -8,6 +10,7 @@ namespace DotnetSpider.Extraction.Model /// /// 额外选项的定义 /// + [JsonConverter(typeof(StringEnumConverter))] public enum FieldOptions { /// diff --git a/src/DotnetSpider.Extraction/Model/ModelDefinition.cs b/src/DotnetSpider.Extraction/Model/ModelDefinition.cs index 197477c61..eaf04319e 100644 --- a/src/DotnetSpider.Extraction/Model/ModelDefinition.cs +++ b/src/DotnetSpider.Extraction/Model/ModelDefinition.cs @@ -1,4 +1,5 @@ using DotnetSpider.Extraction.Model.Attribute; +using Newtonsoft.Json; using System; using System.Collections.Generic; using System.Linq; @@ -43,6 +44,7 @@ public class ModelDefinition : IModel /// public IEnumerable SharedValueSelectors { get; protected set; } + [JsonIgnore] public string Identity { get; protected set; } public ModelDefinition(Selector selector, IEnumerable fields, TableInfo table, diff --git a/src/DotnetSpider.Extraction/Model/Selector.cs b/src/DotnetSpider.Extraction/Model/Selector.cs index 1a2b32712..8631a844b 100644 --- a/src/DotnetSpider.Extraction/Model/Selector.cs +++ b/src/DotnetSpider.Extraction/Model/Selector.cs @@ -1,10 +1,16 @@ -namespace DotnetSpider.Extraction.Model +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; + +namespace DotnetSpider.Extraction.Model { /// /// 选择器特性 /// public class Selector : System.Attribute { + [JsonIgnore] + public override object TypeId => base.TypeId; + /// /// 构造方法 /// diff --git a/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs b/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs index d9ee4d070..a37765615 100644 --- a/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs +++ b/src/DotnetSpider.Extraction/Model/TableNamePostfix.cs @@ -1,5 +1,9 @@ -namespace DotnetSpider.Extraction.Model +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; + +namespace DotnetSpider.Extraction.Model { + [JsonConverter(typeof(StringEnumConverter))] public enum TableNamePostfix { None, diff --git a/src/DotnetSpider.Extraction/SelectorType.cs b/src/DotnetSpider.Extraction/SelectorType.cs index 450fbf1fb..246c11484 100644 --- a/src/DotnetSpider.Extraction/SelectorType.cs +++ b/src/DotnetSpider.Extraction/SelectorType.cs @@ -1,4 +1,6 @@ -using System; +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; +using System; namespace DotnetSpider.Extraction { @@ -6,6 +8,7 @@ namespace DotnetSpider.Extraction /// 查询器类型 /// [Flags] + [JsonConverter(typeof(StringEnumConverter))] public enum SelectorType { /// diff --git a/src/DotnetSpider.Node/DotnetSpider.Node.csproj b/src/DotnetSpider.Node/DotnetSpider.Node.csproj new file mode 100644 index 000000000..2593952a6 --- /dev/null +++ b/src/DotnetSpider.Node/DotnetSpider.Node.csproj @@ -0,0 +1,10 @@ + + + + netcoreapp2.1 + + Exe + + + + diff --git a/src/DotnetSpider.Node/Program.cs b/src/DotnetSpider.Node/Program.cs new file mode 100644 index 000000000..19279655d --- /dev/null +++ b/src/DotnetSpider.Node/Program.cs @@ -0,0 +1,15 @@ +using System; +using System.Linq; + +namespace DotnetSpider.Node +{ + class Program + { + static void Main(string[] args) + { + if (args.Contains("daemon")) + { + } + } + } +} diff --git a/src/DotnetSpider.Sample/Program.cs b/src/DotnetSpider.Sample/Program.cs index 6d0d501b9..869150e1f 100644 --- a/src/DotnetSpider.Sample/Program.cs +++ b/src/DotnetSpider.Sample/Program.cs @@ -1,4 +1,5 @@ using DotnetSpider.Core; +using DotnetSpider.Sample.docs; using System.Threading; namespace DotnetSpider.Sample @@ -12,7 +13,8 @@ static void Main(string[] args) #else ThreadPool.SetMinThreads(256, 256); #endif - Startup.Run(args); + + ModelSpider.Run(); } /// diff --git a/src/DotnetSpider.Sample/docs/ModelSpider.cs b/src/DotnetSpider.Sample/docs/ModelSpider.cs index f5efcaefb..8eb8d85c8 100644 --- a/src/DotnetSpider.Sample/docs/ModelSpider.cs +++ b/src/DotnetSpider.Sample/docs/ModelSpider.cs @@ -6,6 +6,7 @@ using DotnetSpider.Extraction; using DotnetSpider.Extraction.Model; using DotnetSpider.Extraction.Model.Attribute; +using Newtonsoft.Json; namespace DotnetSpider.Sample.docs { @@ -23,7 +24,7 @@ public static void Run() }; var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']"); var model = new ModelDefinition(selector, fields, table, targetRequestSelector); - + var json = JsonConvert.SerializeObject(model); // Config encoding, header, cookie, proxy etc... 定义采集的 Site 对象, 设置 Header、Cookie、代理等 var site = new Site { EncodingName = "UTF-8" }; for (int i = 1; i < 5; ++i)